Index: projects/clang360-import/contrib/llvm/include/llvm/CodeGen/SelectionDAG.h =================================================================== --- projects/clang360-import/contrib/llvm/include/llvm/CodeGen/SelectionDAG.h (revision 279024) +++ projects/clang360-import/contrib/llvm/include/llvm/CodeGen/SelectionDAG.h (revision 279025) @@ -1,1284 +1,1286 @@ //===-- llvm/CodeGen/SelectionDAG.h - InstSelection DAG ---------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file declares the SelectionDAG class, and transitively defines the // SDNode class and subclasses. // //===----------------------------------------------------------------------===// #ifndef LLVM_CODEGEN_SELECTIONDAG_H #define LLVM_CODEGEN_SELECTIONDAG_H #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/ilist.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/Support/RecyclingAllocator.h" #include "llvm/Target/TargetMachine.h" #include #include #include #include namespace llvm { class AliasAnalysis; class MachineConstantPoolValue; class MachineFunction; class MDNode; class SDDbgValue; class TargetLowering; class TargetSelectionDAGInfo; class SDVTListNode : public FoldingSetNode { friend struct FoldingSetTrait; /// FastID - A reference to an Interned FoldingSetNodeID for this node. /// The Allocator in SelectionDAG holds the data. /// SDVTList contains all types which are frequently accessed in SelectionDAG. /// The size of this list is not expected big so it won't introduce memory penalty. FoldingSetNodeIDRef FastID; const EVT *VTs; unsigned int NumVTs; /// The hash value for SDVTList is fixed so cache it to avoid hash calculation unsigned HashValue; public: SDVTListNode(const FoldingSetNodeIDRef ID, const EVT *VT, unsigned int Num) : FastID(ID), VTs(VT), NumVTs(Num) { HashValue = ID.ComputeHash(); } SDVTList getSDVTList() { SDVTList result = {VTs, NumVTs}; return result; } }; // Specialize FoldingSetTrait for SDVTListNode // To avoid computing temp FoldingSetNodeID and hash value. template<> struct FoldingSetTrait : DefaultFoldingSetTrait { static void Profile(const SDVTListNode &X, FoldingSetNodeID& ID) { ID = X.FastID; } static bool Equals(const SDVTListNode &X, const FoldingSetNodeID &ID, unsigned IDHash, FoldingSetNodeID &TempID) { if (X.HashValue != IDHash) return false; return ID == X.FastID; } static unsigned ComputeHash(const SDVTListNode &X, FoldingSetNodeID &TempID) { return X.HashValue; } }; template<> struct ilist_traits : public ilist_default_traits { private: mutable ilist_half_node Sentinel; public: SDNode *createSentinel() const { return static_cast(&Sentinel); } static void destroySentinel(SDNode *) {} SDNode *provideInitialHead() const { return createSentinel(); } SDNode *ensureHead(SDNode*) const { return createSentinel(); } static void noteHead(SDNode*, SDNode*) {} static void deleteNode(SDNode *) { llvm_unreachable("ilist_traits shouldn't see a deleteNode call!"); } private: static void createNode(const SDNode &); }; /// SDDbgInfo - Keeps track of dbg_value information through SDISel. We do /// not build SDNodes for these so as not to perturb the generated code; /// instead the info is kept off to the side in this structure. Each SDNode may /// have one or more associated dbg_value entries. This information is kept in /// DbgValMap. /// Byval parameters are handled separately because they don't use alloca's, /// which busts the normal mechanism. There is good reason for handling all /// parameters separately: they may not have code generated for them, they /// should always go at the beginning of the function regardless of other code /// motion, and debug info for them is potentially useful even if the parameter /// is unused. Right now only byval parameters are handled separately. class SDDbgInfo { SmallVector DbgValues; SmallVector ByvalParmDbgValues; typedef DenseMap > DbgValMapType; DbgValMapType DbgValMap; void operator=(const SDDbgInfo&) LLVM_DELETED_FUNCTION; SDDbgInfo(const SDDbgInfo&) LLVM_DELETED_FUNCTION; public: SDDbgInfo() {} void add(SDDbgValue *V, const SDNode *Node, bool isParameter) { if (isParameter) { ByvalParmDbgValues.push_back(V); } else DbgValues.push_back(V); if (Node) DbgValMap[Node].push_back(V); } /// \brief Invalidate all DbgValues attached to the node and remove /// it from the Node-to-DbgValues map. void erase(const SDNode *Node); void clear() { DbgValMap.clear(); DbgValues.clear(); ByvalParmDbgValues.clear(); } bool empty() const { return DbgValues.empty() && ByvalParmDbgValues.empty(); } ArrayRef getSDDbgValues(const SDNode *Node) { DbgValMapType::iterator I = DbgValMap.find(Node); if (I != DbgValMap.end()) return I->second; return ArrayRef(); } typedef SmallVectorImpl::iterator DbgIterator; DbgIterator DbgBegin() { return DbgValues.begin(); } DbgIterator DbgEnd() { return DbgValues.end(); } DbgIterator ByvalParmDbgBegin() { return ByvalParmDbgValues.begin(); } DbgIterator ByvalParmDbgEnd() { return ByvalParmDbgValues.end(); } }; class SelectionDAG; void checkForCycles(const SelectionDAG *DAG, bool force = false); /// SelectionDAG class - This is used to represent a portion of an LLVM function /// in a low-level Data Dependence DAG representation suitable for instruction /// selection. This DAG is constructed as the first step of instruction /// selection in order to allow implementation of machine specific optimizations /// and code simplifications. /// /// The representation used by the SelectionDAG is a target-independent /// representation, which has some similarities to the GCC RTL representation, /// but is significantly more simple, powerful, and is a graph form instead of a /// linear form. /// class SelectionDAG { const TargetMachine &TM; const TargetSelectionDAGInfo *TSI; const TargetLowering *TLI; MachineFunction *MF; LLVMContext *Context; CodeGenOpt::Level OptLevel; /// EntryNode - The starting token. SDNode EntryNode; /// Root - The root of the entire DAG. SDValue Root; /// AllNodes - A linked list of nodes in the current DAG. ilist AllNodes; /// NodeAllocatorType - The AllocatorType for allocating SDNodes. We use /// pool allocation with recycling. typedef RecyclingAllocator::Alignment> NodeAllocatorType; /// NodeAllocator - Pool allocation for nodes. NodeAllocatorType NodeAllocator; /// CSEMap - This structure is used to memoize nodes, automatically performing /// CSE with existing nodes when a duplicate is requested. FoldingSet CSEMap; /// OperandAllocator - Pool allocation for machine-opcode SDNode operands. BumpPtrAllocator OperandAllocator; /// Allocator - Pool allocation for misc. objects that are created once per /// SelectionDAG. BumpPtrAllocator Allocator; /// DbgInfo - Tracks dbg_value information through SDISel. SDDbgInfo *DbgInfo; public: /// DAGUpdateListener - Clients of various APIs that cause global effects on /// the DAG can optionally implement this interface. This allows the clients /// to handle the various sorts of updates that happen. /// /// A DAGUpdateListener automatically registers itself with DAG when it is /// constructed, and removes itself when destroyed in RAII fashion. struct DAGUpdateListener { DAGUpdateListener *const Next; SelectionDAG &DAG; explicit DAGUpdateListener(SelectionDAG &D) : Next(D.UpdateListeners), DAG(D) { DAG.UpdateListeners = this; } virtual ~DAGUpdateListener() { assert(DAG.UpdateListeners == this && "DAGUpdateListeners must be destroyed in LIFO order"); DAG.UpdateListeners = Next; } /// NodeDeleted - The node N that was deleted and, if E is not null, an /// equivalent node E that replaced it. virtual void NodeDeleted(SDNode *N, SDNode *E); /// NodeUpdated - The node N that was updated. virtual void NodeUpdated(SDNode *N); }; /// NewNodesMustHaveLegalTypes - When true, additional steps are taken to /// ensure that getConstant() and similar functions return DAG nodes that /// have legal types. This is important after type legalization since /// any illegally typed nodes generated after this point will not experience /// type legalization. bool NewNodesMustHaveLegalTypes; private: /// DAGUpdateListener is a friend so it can manipulate the listener stack. friend struct DAGUpdateListener; /// UpdateListeners - Linked list of registered DAGUpdateListener instances. /// This stack is maintained by DAGUpdateListener RAII. DAGUpdateListener *UpdateListeners; /// setGraphColorHelper - Implementation of setSubgraphColor. /// Return whether we had to truncate the search. /// bool setSubgraphColorHelper(SDNode *N, const char *Color, DenseSet &visited, int level, bool &printed); void operator=(const SelectionDAG&) LLVM_DELETED_FUNCTION; SelectionDAG(const SelectionDAG&) LLVM_DELETED_FUNCTION; public: explicit SelectionDAG(const TargetMachine &TM, llvm::CodeGenOpt::Level); ~SelectionDAG(); /// init - Prepare this SelectionDAG to process code in the given /// MachineFunction. /// void init(MachineFunction &mf); /// clear - Clear state and free memory necessary to make this /// SelectionDAG ready to process a new block. /// void clear(); MachineFunction &getMachineFunction() const { return *MF; } const TargetMachine &getTarget() const { return TM; } const TargetSubtargetInfo &getSubtarget() const { return MF->getSubtarget(); } const TargetLowering &getTargetLoweringInfo() const { return *TLI; } const TargetSelectionDAGInfo &getSelectionDAGInfo() const { return *TSI; } LLVMContext *getContext() const {return Context; } /// viewGraph - Pop up a GraphViz/gv window with the DAG rendered using 'dot'. /// void viewGraph(const std::string &Title); void viewGraph(); #ifndef NDEBUG std::map NodeGraphAttrs; #endif /// clearGraphAttrs - Clear all previously defined node graph attributes. /// Intended to be used from a debugging tool (eg. gdb). void clearGraphAttrs(); /// setGraphAttrs - Set graph attributes for a node. (eg. "color=red".) /// void setGraphAttrs(const SDNode *N, const char *Attrs); /// getGraphAttrs - Get graph attributes for a node. (eg. "color=red".) /// Used from getNodeAttributes. const std::string getGraphAttrs(const SDNode *N) const; /// setGraphColor - Convenience for setting node color attribute. /// void setGraphColor(const SDNode *N, const char *Color); /// setGraphColor - Convenience for setting subgraph color attribute. /// void setSubgraphColor(SDNode *N, const char *Color); typedef ilist::const_iterator allnodes_const_iterator; allnodes_const_iterator allnodes_begin() const { return AllNodes.begin(); } allnodes_const_iterator allnodes_end() const { return AllNodes.end(); } typedef ilist::iterator allnodes_iterator; allnodes_iterator allnodes_begin() { return AllNodes.begin(); } allnodes_iterator allnodes_end() { return AllNodes.end(); } ilist::size_type allnodes_size() const { return AllNodes.size(); } /// getRoot - Return the root tag of the SelectionDAG. /// const SDValue &getRoot() const { return Root; } /// getEntryNode - Return the token chain corresponding to the entry of the /// function. SDValue getEntryNode() const { return SDValue(const_cast(&EntryNode), 0); } /// setRoot - Set the current root tag of the SelectionDAG. /// const SDValue &setRoot(SDValue N) { assert((!N.getNode() || N.getValueType() == MVT::Other) && "DAG root value is not a chain!"); if (N.getNode()) checkForCycles(N.getNode(), this); Root = N; if (N.getNode()) checkForCycles(this); return Root; } /// Combine - This iterates over the nodes in the SelectionDAG, folding /// certain types of nodes together, or eliminating superfluous nodes. The /// Level argument controls whether Combine is allowed to produce nodes and /// types that are illegal on the target. void Combine(CombineLevel Level, AliasAnalysis &AA, CodeGenOpt::Level OptLevel); /// LegalizeTypes - This transforms the SelectionDAG into a SelectionDAG that /// only uses types natively supported by the target. Returns "true" if it /// made any changes. /// /// Note that this is an involved process that may invalidate pointers into /// the graph. bool LegalizeTypes(); /// Legalize - This transforms the SelectionDAG into a SelectionDAG that is /// compatible with the target instruction selector, as indicated by the /// TargetLowering object. /// /// Note that this is an involved process that may invalidate pointers into /// the graph. void Legalize(); /// \brief Transforms a SelectionDAG node and any operands to it into a node /// that is compatible with the target instruction selector, as indicated by /// the TargetLowering object. /// /// \returns true if \c N is a valid, legal node after calling this. /// /// This essentially runs a single recursive walk of the \c Legalize process /// over the given node (and its operands). This can be used to incrementally /// legalize the DAG. All of the nodes which are directly replaced, /// potentially including N, are added to the output parameter \c /// UpdatedNodes so that the delta to the DAG can be understood by the /// caller. /// /// When this returns false, N has been legalized in a way that make the /// pointer passed in no longer valid. It may have even been deleted from the /// DAG, and so it shouldn't be used further. When this returns true, the /// N passed in is a legal node, and can be immediately processed as such. /// This may still have done some work on the DAG, and will still populate /// UpdatedNodes with any new nodes replacing those originally in the DAG. bool LegalizeOp(SDNode *N, SmallSetVector &UpdatedNodes); /// LegalizeVectors - This transforms the SelectionDAG into a SelectionDAG /// that only uses vector math operations supported by the target. This is /// necessary as a separate step from Legalize because unrolling a vector /// operation can introduce illegal types, which requires running /// LegalizeTypes again. /// /// This returns true if it made any changes; in that case, LegalizeTypes /// is called again before Legalize. /// /// Note that this is an involved process that may invalidate pointers into /// the graph. bool LegalizeVectors(); /// RemoveDeadNodes - This method deletes all unreachable nodes in the /// SelectionDAG. void RemoveDeadNodes(); /// DeleteNode - Remove the specified node from the system. This node must /// have no referrers. void DeleteNode(SDNode *N); /// getVTList - Return an SDVTList that represents the list of values /// specified. SDVTList getVTList(EVT VT); SDVTList getVTList(EVT VT1, EVT VT2); SDVTList getVTList(EVT VT1, EVT VT2, EVT VT3); SDVTList getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4); SDVTList getVTList(ArrayRef VTs); //===--------------------------------------------------------------------===// // Node creation methods. // SDValue getConstant(uint64_t Val, EVT VT, bool isTarget = false, bool isOpaque = false); SDValue getConstant(const APInt &Val, EVT VT, bool isTarget = false, bool isOpaque = false); SDValue getConstant(const ConstantInt &Val, EVT VT, bool isTarget = false, bool isOpaque = false); SDValue getIntPtrConstant(uint64_t Val, bool isTarget = false); SDValue getTargetConstant(uint64_t Val, EVT VT, bool isOpaque = false) { return getConstant(Val, VT, true, isOpaque); } SDValue getTargetConstant(const APInt &Val, EVT VT, bool isOpaque = false) { return getConstant(Val, VT, true, isOpaque); } SDValue getTargetConstant(const ConstantInt &Val, EVT VT, bool isOpaque = false) { return getConstant(Val, VT, true, isOpaque); } // The forms below that take a double should only be used for simple // constants that can be exactly represented in VT. No checks are made. SDValue getConstantFP(double Val, EVT VT, bool isTarget = false); SDValue getConstantFP(const APFloat& Val, EVT VT, bool isTarget = false); SDValue getConstantFP(const ConstantFP &CF, EVT VT, bool isTarget = false); SDValue getTargetConstantFP(double Val, EVT VT) { return getConstantFP(Val, VT, true); } SDValue getTargetConstantFP(const APFloat& Val, EVT VT) { return getConstantFP(Val, VT, true); } SDValue getTargetConstantFP(const ConstantFP &Val, EVT VT) { return getConstantFP(Val, VT, true); } SDValue getGlobalAddress(const GlobalValue *GV, SDLoc DL, EVT VT, int64_t offset = 0, bool isTargetGA = false, unsigned char TargetFlags = 0); SDValue getTargetGlobalAddress(const GlobalValue *GV, SDLoc DL, EVT VT, int64_t offset = 0, unsigned char TargetFlags = 0) { return getGlobalAddress(GV, DL, VT, offset, true, TargetFlags); } SDValue getFrameIndex(int FI, EVT VT, bool isTarget = false); SDValue getTargetFrameIndex(int FI, EVT VT) { return getFrameIndex(FI, VT, true); } SDValue getJumpTable(int JTI, EVT VT, bool isTarget = false, unsigned char TargetFlags = 0); SDValue getTargetJumpTable(int JTI, EVT VT, unsigned char TargetFlags = 0) { return getJumpTable(JTI, VT, true, TargetFlags); } SDValue getConstantPool(const Constant *C, EVT VT, unsigned Align = 0, int Offs = 0, bool isT=false, unsigned char TargetFlags = 0); SDValue getTargetConstantPool(const Constant *C, EVT VT, unsigned Align = 0, int Offset = 0, unsigned char TargetFlags = 0) { return getConstantPool(C, VT, Align, Offset, true, TargetFlags); } SDValue getConstantPool(MachineConstantPoolValue *C, EVT VT, unsigned Align = 0, int Offs = 0, bool isT=false, unsigned char TargetFlags = 0); SDValue getTargetConstantPool(MachineConstantPoolValue *C, EVT VT, unsigned Align = 0, int Offset = 0, unsigned char TargetFlags=0) { return getConstantPool(C, VT, Align, Offset, true, TargetFlags); } SDValue getTargetIndex(int Index, EVT VT, int64_t Offset = 0, unsigned char TargetFlags = 0); // When generating a branch to a BB, we don't in general know enough // to provide debug info for the BB at that time, so keep this one around. SDValue getBasicBlock(MachineBasicBlock *MBB); SDValue getBasicBlock(MachineBasicBlock *MBB, SDLoc dl); SDValue getExternalSymbol(const char *Sym, EVT VT); SDValue getExternalSymbol(const char *Sym, SDLoc dl, EVT VT); SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned char TargetFlags = 0); SDValue getValueType(EVT); SDValue getRegister(unsigned Reg, EVT VT); SDValue getRegisterMask(const uint32_t *RegMask); SDValue getEHLabel(SDLoc dl, SDValue Root, MCSymbol *Label); SDValue getBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset = 0, bool isTarget = false, unsigned char TargetFlags = 0); SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset = 0, unsigned char TargetFlags = 0) { return getBlockAddress(BA, VT, Offset, true, TargetFlags); } SDValue getCopyToReg(SDValue Chain, SDLoc dl, unsigned Reg, SDValue N) { return getNode(ISD::CopyToReg, dl, MVT::Other, Chain, getRegister(Reg, N.getValueType()), N); } // This version of the getCopyToReg method takes an extra operand, which // indicates that there is potentially an incoming glue value (if Glue is not // null) and that there should be a glue result. SDValue getCopyToReg(SDValue Chain, SDLoc dl, unsigned Reg, SDValue N, SDValue Glue) { SDVTList VTs = getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, getRegister(Reg, N.getValueType()), N, Glue }; return getNode(ISD::CopyToReg, dl, VTs, ArrayRef(Ops, Glue.getNode() ? 4 : 3)); } // Similar to last getCopyToReg() except parameter Reg is a SDValue SDValue getCopyToReg(SDValue Chain, SDLoc dl, SDValue Reg, SDValue N, SDValue Glue) { SDVTList VTs = getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Reg, N, Glue }; return getNode(ISD::CopyToReg, dl, VTs, ArrayRef(Ops, Glue.getNode() ? 4 : 3)); } SDValue getCopyFromReg(SDValue Chain, SDLoc dl, unsigned Reg, EVT VT) { SDVTList VTs = getVTList(VT, MVT::Other); SDValue Ops[] = { Chain, getRegister(Reg, VT) }; return getNode(ISD::CopyFromReg, dl, VTs, Ops); } // This version of the getCopyFromReg method takes an extra operand, which // indicates that there is potentially an incoming glue value (if Glue is not // null) and that there should be a glue result. SDValue getCopyFromReg(SDValue Chain, SDLoc dl, unsigned Reg, EVT VT, SDValue Glue) { SDVTList VTs = getVTList(VT, MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, getRegister(Reg, VT), Glue }; return getNode(ISD::CopyFromReg, dl, VTs, ArrayRef(Ops, Glue.getNode() ? 3 : 2)); } SDValue getCondCode(ISD::CondCode Cond); /// Returns the ConvertRndSat Note: Avoid using this node because it may /// disappear in the future and most targets don't support it. SDValue getConvertRndSat(EVT VT, SDLoc dl, SDValue Val, SDValue DTy, SDValue STy, SDValue Rnd, SDValue Sat, ISD::CvtCode Code); /// getVectorShuffle - Return an ISD::VECTOR_SHUFFLE node. The number of /// elements in VT, which must be a vector type, must match the number of /// mask elements NumElts. A integer mask element equal to -1 is treated as /// undefined. SDValue getVectorShuffle(EVT VT, SDLoc dl, SDValue N1, SDValue N2, const int *MaskElts); SDValue getVectorShuffle(EVT VT, SDLoc dl, SDValue N1, SDValue N2, ArrayRef MaskElts) { assert(VT.getVectorNumElements() == MaskElts.size() && "Must have the same number of vector elements as mask elements!"); return getVectorShuffle(VT, dl, N1, N2, MaskElts.data()); } /// \brief Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to /// the shuffle node in input but with swapped operands. /// /// Example: shuffle A, B, <0,5,2,7> -> shuffle B, A, <4,1,6,3> SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV); /// getAnyExtOrTrunc - Convert Op, which must be of integer type, to the /// integer type VT, by either any-extending or truncating it. SDValue getAnyExtOrTrunc(SDValue Op, SDLoc DL, EVT VT); /// getSExtOrTrunc - Convert Op, which must be of integer type, to the /// integer type VT, by either sign-extending or truncating it. SDValue getSExtOrTrunc(SDValue Op, SDLoc DL, EVT VT); /// getZExtOrTrunc - Convert Op, which must be of integer type, to the /// integer type VT, by either zero-extending or truncating it. SDValue getZExtOrTrunc(SDValue Op, SDLoc DL, EVT VT); /// getZeroExtendInReg - Return the expression required to zero extend the Op /// value assuming it was the smaller SrcTy value. SDValue getZeroExtendInReg(SDValue Op, SDLoc DL, EVT SrcTy); /// getAnyExtendVectorInReg - Return an operation which will any-extend the /// low lanes of the operand into the specified vector type. For example, /// this can convert a v16i8 into a v4i32 by any-extending the low four /// lanes of the operand from i8 to i32. SDValue getAnyExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT); /// getSignExtendVectorInReg - Return an operation which will sign extend the /// low lanes of the operand into the specified vector type. For example, /// this can convert a v16i8 into a v4i32 by sign extending the low four /// lanes of the operand from i8 to i32. SDValue getSignExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT); /// getZeroExtendVectorInReg - Return an operation which will zero extend the /// low lanes of the operand into the specified vector type. For example, /// this can convert a v16i8 into a v4i32 by zero extending the low four /// lanes of the operand from i8 to i32. SDValue getZeroExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT); /// getBoolExtOrTrunc - Convert Op, which must be of integer type, to the /// integer type VT, by using an extension appropriate for the target's /// BooleanContent for type OpVT or truncating it. SDValue getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT, EVT OpVT); /// getNOT - Create a bitwise NOT operation as (XOR Val, -1). SDValue getNOT(SDLoc DL, SDValue Val, EVT VT); /// \brief Create a logical NOT operation as (XOR Val, BooleanOne). SDValue getLogicalNOT(SDLoc DL, SDValue Val, EVT VT); /// getCALLSEQ_START - Return a new CALLSEQ_START node, which always must have /// a glue result (to ensure it's not CSE'd). CALLSEQ_START does not have a /// useful SDLoc. SDValue getCALLSEQ_START(SDValue Chain, SDValue Op, SDLoc DL) { SDVTList VTs = getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Op }; return getNode(ISD::CALLSEQ_START, DL, VTs, Ops); } /// getCALLSEQ_END - Return a new CALLSEQ_END node, which always must have a /// glue result (to ensure it's not CSE'd). CALLSEQ_END does not have /// a useful SDLoc. SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, SDLoc DL) { SDVTList NodeTys = getVTList(MVT::Other, MVT::Glue); SmallVector Ops; Ops.push_back(Chain); Ops.push_back(Op1); Ops.push_back(Op2); if (InGlue.getNode()) Ops.push_back(InGlue); return getNode(ISD::CALLSEQ_END, DL, NodeTys, Ops); } /// getUNDEF - Return an UNDEF node. UNDEF does not have a useful SDLoc. SDValue getUNDEF(EVT VT) { return getNode(ISD::UNDEF, SDLoc(), VT); } /// getGLOBAL_OFFSET_TABLE - Return a GLOBAL_OFFSET_TABLE node. This does /// not have a useful SDLoc. SDValue getGLOBAL_OFFSET_TABLE(EVT VT) { return getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT); } /// getNode - Gets or creates the specified node. /// SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT); SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N); SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, bool nuw = false, bool nsw = false, bool exact = false); SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3); SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3, SDValue N4); SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3, SDValue N4, SDValue N5); SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, ArrayRef Ops); SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, ArrayRef Ops); SDValue getNode(unsigned Opcode, SDLoc DL, ArrayRef ResultTys, ArrayRef Ops); SDValue getNode(unsigned Opcode, SDLoc DL, SDVTList VTs, ArrayRef Ops); SDValue getNode(unsigned Opcode, SDLoc DL, SDVTList VTs); SDValue getNode(unsigned Opcode, SDLoc DL, SDVTList VTs, SDValue N); SDValue getNode(unsigned Opcode, SDLoc DL, SDVTList VTs, SDValue N1, SDValue N2); SDValue getNode(unsigned Opcode, SDLoc DL, SDVTList VTs, SDValue N1, SDValue N2, SDValue N3); SDValue getNode(unsigned Opcode, SDLoc DL, SDVTList VTs, SDValue N1, SDValue N2, SDValue N3, SDValue N4); SDValue getNode(unsigned Opcode, SDLoc DL, SDVTList VTs, SDValue N1, SDValue N2, SDValue N3, SDValue N4, SDValue N5); /// getStackArgumentTokenFactor - Compute a TokenFactor to force all /// the incoming stack arguments to be loaded from the stack. This is /// used in tail call lowering to protect stack arguments from being /// clobbered. SDValue getStackArgumentTokenFactor(SDValue Chain); SDValue getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo); SDValue getMemmove(SDValue Chain, SDLoc dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo); SDValue getMemset(SDValue Chain, SDLoc dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, MachinePointerInfo DstPtrInfo); /// getSetCC - Helper function to make it easier to build SetCC's if you just /// have an ISD::CondCode instead of an SDValue. /// SDValue getSetCC(SDLoc DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond) { assert(LHS.getValueType().isVector() == RHS.getValueType().isVector() && "Cannot compare scalars to vectors"); assert(LHS.getValueType().isVector() == VT.isVector() && "Cannot compare scalars to vectors"); assert(Cond != ISD::SETCC_INVALID && "Cannot create a setCC of an invalid node."); return getNode(ISD::SETCC, DL, VT, LHS, RHS, getCondCode(Cond)); } // getSelect - Helper function to make it easier to build Select's if you just // have operands and don't want to check for vector. SDValue getSelect(SDLoc DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS) { assert(LHS.getValueType() == RHS.getValueType() && "Cannot use select on differing types"); assert(VT.isVector() == LHS.getValueType().isVector() && "Cannot mix vectors and scalars"); return getNode(Cond.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, Cond, LHS, RHS); } /// getSelectCC - Helper function to make it easier to build SelectCC's if you /// just have an ISD::CondCode instead of an SDValue. /// SDValue getSelectCC(SDLoc DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond) { return getNode(ISD::SELECT_CC, DL, True.getValueType(), LHS, RHS, True, False, getCondCode(Cond)); } /// getVAArg - VAArg produces a result and token chain, and takes a pointer /// and a source value as input. SDValue getVAArg(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue SV, unsigned Align); /// getAtomicCmpSwap - Gets a node for an atomic cmpxchg op. There are two /// valid Opcodes. ISD::ATOMIC_CMO_SWAP produces the value loaded and a /// chain result. ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS produces the value loaded, /// a success flag (initially i1), and a chain. SDValue getAtomicCmpSwap(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs, SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo, unsigned Alignment, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope); SDValue getAtomicCmpSwap(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs, SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp, MachineMemOperand *MMO, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope); /// getAtomic - Gets a node for an atomic op, produces result (if relevant) /// and chain and takes 2 operands. SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, const Value *PtrVal, unsigned Alignment, AtomicOrdering Ordering, SynchronizationScope SynchScope); SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope); /// getAtomic - Gets a node for an atomic op, produces result and chain and /// takes 1 operand. SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope); /// getAtomic - Gets a node for an atomic op, produces result and chain and /// takes N operands. SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTList, ArrayRef Ops, MachineMemOperand *MMO, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope); SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTList, ArrayRef Ops, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope); /// getMemIntrinsicNode - Creates a MemIntrinsicNode that may produce a /// result and takes a list of operands. Opcode may be INTRINSIC_VOID, /// INTRINSIC_W_CHAIN, or a target-specific opcode with a value not /// less than FIRST_TARGET_MEMORY_OPCODE. SDValue getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, ArrayRef Ops, EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align = 0, bool Vol = false, bool ReadMem = true, bool WriteMem = true, unsigned Size = 0); SDValue getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, ArrayRef Ops, EVT MemVT, MachineMemOperand *MMO); /// getMergeValues - Create a MERGE_VALUES node from the given operands. SDValue getMergeValues(ArrayRef Ops, SDLoc dl); /// getLoad - Loads are not normal binary operators: their result type is not /// determined by their operands, and they produce a value AND a token chain. /// SDValue getLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, bool isVolatile, bool isNonTemporal, bool isInvariant, unsigned Alignment, const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr); SDValue getLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO); SDValue getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, bool isVolatile, bool isNonTemporal, bool isInvariant, unsigned Alignment, const AAMDNodes &AAInfo = AAMDNodes()); SDValue getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT, SDValue Chain, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO); SDValue getIndexedLoad(SDValue OrigLoad, SDLoc dl, SDValue Base, SDValue Offset, ISD::MemIndexedMode AM); SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue Offset, MachinePointerInfo PtrInfo, EVT MemVT, bool isVolatile, bool isNonTemporal, bool isInvariant, unsigned Alignment, const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr); SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue Offset, EVT MemVT, MachineMemOperand *MMO); /// getStore - Helper function to build ISD::STORE nodes. /// SDValue getStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, bool isVolatile, bool isNonTemporal, unsigned Alignment, const AAMDNodes &AAInfo = AAMDNodes()); SDValue getStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachineMemOperand *MMO); SDValue getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT TVT, bool isNonTemporal, bool isVolatile, unsigned Alignment, const AAMDNodes &AAInfo = AAMDNodes()); SDValue getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, EVT TVT, MachineMemOperand *MMO); SDValue getIndexedStore(SDValue OrigStoe, SDLoc dl, SDValue Base, SDValue Offset, ISD::MemIndexedMode AM); SDValue getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, - SDValue Mask, SDValue Src0, MachineMemOperand *MMO); + SDValue Mask, SDValue Src0, EVT MemVT, + MachineMemOperand *MMO, ISD::LoadExtType); SDValue getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val, - SDValue Ptr, SDValue Mask, MachineMemOperand *MMO); + SDValue Ptr, SDValue Mask, EVT MemVT, + MachineMemOperand *MMO, bool IsTrunc); /// getSrcValue - Construct a node to track a Value* through the backend. SDValue getSrcValue(const Value *v); /// getMDNode - Return an MDNodeSDNode which holds an MDNode. SDValue getMDNode(const MDNode *MD); /// getAddrSpaceCast - Return an AddrSpaceCastSDNode. SDValue getAddrSpaceCast(SDLoc dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS); /// getShiftAmountOperand - Return the specified value casted to /// the target's desired shift amount type. SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op); /// UpdateNodeOperands - *Mutate* the specified node in-place to have the /// specified operands. If the resultant node already exists in the DAG, /// this does not modify the specified node, instead it returns the node that /// already exists. If the resultant node does not exist in the DAG, the /// input node is returned. As a degenerate case, if you specify the same /// input operands as the node already has, the input node is returned. SDNode *UpdateNodeOperands(SDNode *N, SDValue Op); SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2); SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3); SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4); SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4, SDValue Op5); SDNode *UpdateNodeOperands(SDNode *N, ArrayRef Ops); /// SelectNodeTo - These are used for target selectors to *mutate* the /// specified node to have the specified return type, Target opcode, and /// operands. Note that target opcodes are stored as /// ~TargetOpcode in the node opcode field. The resultant node is returned. SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT, SDValue Op1); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT, SDValue Op1, SDValue Op2); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT, SDValue Op1, SDValue Op2, SDValue Op3); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT, ArrayRef Ops); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, EVT VT2); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, EVT VT2, ArrayRef Ops); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, EVT VT2, EVT VT3, ArrayRef Ops); SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, EVT VT3, EVT VT4, ArrayRef Ops); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, EVT VT2, SDValue Op1); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2, SDValue Op3); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, EVT VT2, EVT VT3, SDValue Op1, SDValue Op2, SDValue Op3); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, SDVTList VTs, ArrayRef Ops); /// MorphNodeTo - This *mutates* the specified node to have the specified /// return type, opcode, and operands. SDNode *MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef Ops); /// getMachineNode - These are used for target selectors to create a new node /// with specified return type(s), MachineInstr opcode, and operands. /// /// Note that getMachineNode returns the resultant node. If there is already /// a node of the specified opcode and operands, it returns that node instead /// of the current one. MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, EVT VT); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, EVT VT, SDValue Op1); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, EVT VT, SDValue Op1, SDValue Op2); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, EVT VT, SDValue Op1, SDValue Op2, SDValue Op3); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, EVT VT, ArrayRef Ops); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, SDValue Op1); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2, SDValue Op3); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, ArrayRef Ops); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, EVT VT3, SDValue Op1, SDValue Op2); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, EVT VT3, SDValue Op1, SDValue Op2, SDValue Op3); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, EVT VT3, ArrayRef Ops); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, EVT VT3, EVT VT4, ArrayRef Ops); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, ArrayRef ResultTys, ArrayRef Ops); MachineSDNode *getMachineNode(unsigned Opcode, SDLoc dl, SDVTList VTs, ArrayRef Ops); /// getTargetExtractSubreg - A convenience function for creating /// TargetInstrInfo::EXTRACT_SUBREG nodes. SDValue getTargetExtractSubreg(int SRIdx, SDLoc DL, EVT VT, SDValue Operand); /// getTargetInsertSubreg - A convenience function for creating /// TargetInstrInfo::INSERT_SUBREG nodes. SDValue getTargetInsertSubreg(int SRIdx, SDLoc DL, EVT VT, SDValue Operand, SDValue Subreg); /// getNodeIfExists - Get the specified node if it's already available, or /// else return NULL. SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTs, ArrayRef Ops, bool nuw = false, bool nsw = false, bool exact = false); /// getDbgValue - Creates a SDDbgValue node. /// /// SDNode SDDbgValue *getDbgValue(MDNode *Var, MDNode *Expr, SDNode *N, unsigned R, bool IsIndirect, uint64_t Off, DebugLoc DL, unsigned O); /// Constant SDDbgValue *getConstantDbgValue(MDNode *Var, MDNode *Expr, const Value *C, uint64_t Off, DebugLoc DL, unsigned O); /// FrameIndex SDDbgValue *getFrameIndexDbgValue(MDNode *Var, MDNode *Expr, unsigned FI, uint64_t Off, DebugLoc DL, unsigned O); /// RemoveDeadNode - Remove the specified node from the system. If any of its /// operands then becomes dead, remove them as well. Inform UpdateListener /// for each node deleted. void RemoveDeadNode(SDNode *N); /// RemoveDeadNodes - This method deletes the unreachable nodes in the /// given list, and any nodes that become unreachable as a result. void RemoveDeadNodes(SmallVectorImpl &DeadNodes); /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead. /// This can cause recursive merging of nodes in the DAG. Use the first /// version if 'From' is known to have a single result, use the second /// if you have two nodes with identical results (or if 'To' has a superset /// of the results of 'From'), use the third otherwise. /// /// These methods all take an optional UpdateListener, which (if not null) is /// informed about nodes that are deleted and modified due to recursive /// changes in the dag. /// /// These functions only replace all existing uses. It's possible that as /// these replacements are being performed, CSE may cause the From node /// to be given new uses. These new uses of From are left in place, and /// not automatically transferred to To. /// void ReplaceAllUsesWith(SDValue From, SDValue Op); void ReplaceAllUsesWith(SDNode *From, SDNode *To); void ReplaceAllUsesWith(SDNode *From, const SDValue *To); /// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving /// uses of other values produced by From.Val alone. void ReplaceAllUsesOfValueWith(SDValue From, SDValue To); /// ReplaceAllUsesOfValuesWith - Like ReplaceAllUsesOfValueWith, but /// for multiple values at once. This correctly handles the case where /// there is an overlap between the From values and the To values. void ReplaceAllUsesOfValuesWith(const SDValue *From, const SDValue *To, unsigned Num); /// AssignTopologicalOrder - Topological-sort the AllNodes list and a /// assign a unique node id for each node in the DAG based on their /// topological order. Returns the number of nodes. unsigned AssignTopologicalOrder(); /// RepositionNode - Move node N in the AllNodes list to be immediately /// before the given iterator Position. This may be used to update the /// topological ordering when the list of nodes is modified. void RepositionNode(allnodes_iterator Position, SDNode *N) { AllNodes.insert(Position, AllNodes.remove(N)); } /// isCommutativeBinOp - Returns true if the opcode is a commutative binary /// operation. static bool isCommutativeBinOp(unsigned Opcode) { // FIXME: This should get its info from the td file, so that we can include // target info. switch (Opcode) { case ISD::ADD: case ISD::MUL: case ISD::MULHU: case ISD::MULHS: case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: case ISD::FADD: case ISD::FMUL: case ISD::AND: case ISD::OR: case ISD::XOR: case ISD::SADDO: case ISD::UADDO: case ISD::ADDC: case ISD::ADDE: case ISD::FMINNUM: case ISD::FMAXNUM: return true; default: return false; } } /// Returns an APFloat semantics tag appropriate for the given type. If VT is /// a vector type, the element semantics are returned. static const fltSemantics &EVTToAPFloatSemantics(EVT VT) { switch (VT.getScalarType().getSimpleVT().SimpleTy) { default: llvm_unreachable("Unknown FP format"); case MVT::f16: return APFloat::IEEEhalf; case MVT::f32: return APFloat::IEEEsingle; case MVT::f64: return APFloat::IEEEdouble; case MVT::f80: return APFloat::x87DoubleExtended; case MVT::f128: return APFloat::IEEEquad; case MVT::ppcf128: return APFloat::PPCDoubleDouble; } } /// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the /// value is produced by SD. void AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter); /// GetDbgValues - Get the debug values which reference the given SDNode. ArrayRef GetDbgValues(const SDNode* SD) { return DbgInfo->getSDDbgValues(SD); } /// TransferDbgValues - Transfer SDDbgValues. void TransferDbgValues(SDValue From, SDValue To); /// hasDebugValues - Return true if there are any SDDbgValue nodes associated /// with this SelectionDAG. bool hasDebugValues() const { return !DbgInfo->empty(); } SDDbgInfo::DbgIterator DbgBegin() { return DbgInfo->DbgBegin(); } SDDbgInfo::DbgIterator DbgEnd() { return DbgInfo->DbgEnd(); } SDDbgInfo::DbgIterator ByvalParmDbgBegin() { return DbgInfo->ByvalParmDbgBegin(); } SDDbgInfo::DbgIterator ByvalParmDbgEnd() { return DbgInfo->ByvalParmDbgEnd(); } void dump() const; /// CreateStackTemporary - Create a stack temporary, suitable for holding the /// specified value type. If minAlign is specified, the slot size will have /// at least that alignment. SDValue CreateStackTemporary(EVT VT, unsigned minAlign = 1); /// CreateStackTemporary - Create a stack temporary suitable for holding /// either of the specified value types. SDValue CreateStackTemporary(EVT VT1, EVT VT2); /// FoldConstantArithmetic - SDValue FoldConstantArithmetic(unsigned Opcode, EVT VT, SDNode *Cst1, SDNode *Cst2); /// FoldSetCC - Constant fold a setcc to true or false. SDValue FoldSetCC(EVT VT, SDValue N1, SDValue N2, ISD::CondCode Cond, SDLoc dl); /// SignBitIsZero - Return true if the sign bit of Op is known to be zero. We /// use this predicate to simplify operations downstream. bool SignBitIsZero(SDValue Op, unsigned Depth = 0) const; /// MaskedValueIsZero - Return true if 'Op & Mask' is known to be zero. We /// use this predicate to simplify operations downstream. Op and Mask are /// known to be the same type. bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth = 0) const; /// Determine which bits of Op are known to be either zero or one and return /// them in the KnownZero/KnownOne bitsets. Targets can implement the /// computeKnownBitsForTargetNode method in the TargetLowering class to allow /// target nodes to be understood. void computeKnownBits(SDValue Op, APInt &KnownZero, APInt &KnownOne, unsigned Depth = 0) const; /// ComputeNumSignBits - Return the number of times the sign bit of the /// register is replicated into the other bits. We know that at least 1 bit /// is always equal to the sign bit (itself), but other cases can give us /// information. For example, immediately after an "SRA X, 2", we know that /// the top 3 bits are all equal to each other, so we return 3. Targets can /// implement the ComputeNumSignBitsForTarget method in the TargetLowering /// class to allow target nodes to be understood. unsigned ComputeNumSignBits(SDValue Op, unsigned Depth = 0) const; /// isBaseWithConstantOffset - Return true if the specified operand is an /// ISD::ADD with a ConstantSDNode on the right-hand side, or if it is an /// ISD::OR with a ConstantSDNode that is guaranteed to have the same /// semantics as an ADD. This handles the equivalence: /// X|Cst == X+Cst iff X&Cst = 0. bool isBaseWithConstantOffset(SDValue Op) const; /// isKnownNeverNan - Test whether the given SDValue is known to never be NaN. bool isKnownNeverNaN(SDValue Op) const; /// isKnownNeverZero - Test whether the given SDValue is known to never be /// positive or negative Zero. bool isKnownNeverZero(SDValue Op) const; /// isEqualTo - Test whether two SDValues are known to compare equal. This /// is true if they are the same value, or if one is negative zero and the /// other positive zero. bool isEqualTo(SDValue A, SDValue B) const; /// UnrollVectorOp - Utility function used by legalize and lowering to /// "unroll" a vector operation by splitting out the scalars and operating /// on each element individually. If the ResNE is 0, fully unroll the vector /// op. If ResNE is less than the width of the vector op, unroll up to ResNE. /// If the ResNE is greater than the width of the vector op, unroll the /// vector op and fill the end of the resulting vector with UNDEFS. SDValue UnrollVectorOp(SDNode *N, unsigned ResNE = 0); /// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a /// location that is 'Dist' units away from the location that the 'Base' load /// is loading from. bool isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const; /// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if /// it cannot be inferred. unsigned InferPtrAlignment(SDValue Ptr) const; /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type /// which is split (or expanded) into two not necessarily identical pieces. std::pair GetSplitDestVTs(const EVT &VT) const; /// SplitVector - Split the vector with EXTRACT_SUBVECTOR using the provides /// VTs and return the low/high part. std::pair SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT); /// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the /// low/high part. std::pair SplitVector(const SDValue &N, const SDLoc &DL) { EVT LoVT, HiVT; std::tie(LoVT, HiVT) = GetSplitDestVTs(N.getValueType()); return SplitVector(N, DL, LoVT, HiVT); } /// SplitVectorOperand - Split the node's operand with EXTRACT_SUBVECTOR and /// return the low/high part. std::pair SplitVectorOperand(const SDNode *N, unsigned OpNo) { return SplitVector(N->getOperand(OpNo), SDLoc(N)); } /// ExtractVectorElements - Append the extracted elements from Start to Count /// out of the vector Op in Args. If Count is 0, all of the elements will be /// extracted. void ExtractVectorElements(SDValue Op, SmallVectorImpl &Args, unsigned Start = 0, unsigned Count = 0); unsigned getEVTAlignment(EVT MemoryVT) const; private: void InsertNode(SDNode *N); bool RemoveNodeFromCSEMaps(SDNode *N); void AddModifiedNodeToCSEMaps(SDNode *N); SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op, void *&InsertPos); SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op1, SDValue Op2, void *&InsertPos); SDNode *FindModifiedNodeSlot(SDNode *N, ArrayRef Ops, void *&InsertPos); SDNode *UpdadeSDLocOnMergedSDNode(SDNode *N, SDLoc loc); void DeleteNodeNotInCSEMaps(SDNode *N); void DeallocateNode(SDNode *N); void allnodes_clear(); BinarySDNode *GetBinarySDNode(unsigned Opcode, SDLoc DL, SDVTList VTs, SDValue N1, SDValue N2, bool nuw, bool nsw, bool exact); /// VTList - List of non-single value types. FoldingSet VTListMap; /// CondCodeNodes - Maps to auto-CSE operations. std::vector CondCodeNodes; std::vector ValueTypeNodes; std::map ExtendedValueTypeNodes; StringMap ExternalSymbols; std::map,SDNode*> TargetExternalSymbols; }; template <> struct GraphTraits : public GraphTraits { typedef SelectionDAG::allnodes_iterator nodes_iterator; static nodes_iterator nodes_begin(SelectionDAG *G) { return G->allnodes_begin(); } static nodes_iterator nodes_end(SelectionDAG *G) { return G->allnodes_end(); } }; } // end namespace llvm #endif Index: projects/clang360-import/contrib/llvm/include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- projects/clang360-import/contrib/llvm/include/llvm/CodeGen/SelectionDAGNodes.h (revision 279024) +++ projects/clang360-import/contrib/llvm/include/llvm/CodeGen/SelectionDAGNodes.h (revision 279025) @@ -1,2186 +1,2195 @@ //===-- llvm/CodeGen/SelectionDAGNodes.h - SelectionDAG Nodes ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file declares the SDNode class and derived classes, which are used to // represent the nodes and operations present in a SelectionDAG. These nodes // and operations are machine code level operations, with some similarities to // the GCC RTL representation. // // Clients should include the SelectionDAG.h file instead of this file directly. // //===----------------------------------------------------------------------===// #ifndef LLVM_CODEGEN_SELECTIONDAGNODES_H #define LLVM_CODEGEN_SELECTIONDAGNODES_H #include "llvm/ADT/BitVector.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/ilist_node.h" #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/MathExtras.h" #include namespace llvm { class SelectionDAG; class GlobalValue; class MachineBasicBlock; class MachineConstantPoolValue; class SDNode; class Value; class MCSymbol; template struct DenseMapInfo; template struct simplify_type; template struct ilist_traits; /// isBinOpWithFlags - Returns true if the opcode is a binary operation /// with flags. static bool isBinOpWithFlags(unsigned Opcode) { switch (Opcode) { case ISD::SDIV: case ISD::UDIV: case ISD::SRA: case ISD::SRL: case ISD::MUL: case ISD::ADD: case ISD::SUB: case ISD::SHL: return true; default: return false; } } void checkForCycles(const SDNode *N, const SelectionDAG *DAG = nullptr, bool force = false); /// SDVTList - This represents a list of ValueType's that has been intern'd by /// a SelectionDAG. Instances of this simple value class are returned by /// SelectionDAG::getVTList(...). /// struct SDVTList { const EVT *VTs; unsigned int NumVTs; }; namespace ISD { /// Node predicates /// isBuildVectorAllOnes - Return true if the specified node is a /// BUILD_VECTOR where all of the elements are ~0 or undef. bool isBuildVectorAllOnes(const SDNode *N); /// isBuildVectorAllZeros - Return true if the specified node is a /// BUILD_VECTOR where all of the elements are 0 or undef. bool isBuildVectorAllZeros(const SDNode *N); /// \brief Return true if the specified node is a BUILD_VECTOR node of /// all ConstantSDNode or undef. bool isBuildVectorOfConstantSDNodes(const SDNode *N); /// isScalarToVector - Return true if the specified node is a /// ISD::SCALAR_TO_VECTOR node or a BUILD_VECTOR node where only the low /// element is not an undef. bool isScalarToVector(const SDNode *N); /// allOperandsUndef - Return true if the node has at least one operand /// and all operands of the specified node are ISD::UNDEF. bool allOperandsUndef(const SDNode *N); } // end llvm:ISD namespace //===----------------------------------------------------------------------===// /// SDValue - Unlike LLVM values, Selection DAG nodes may return multiple /// values as the result of a computation. Many nodes return multiple values, /// from loads (which define a token and a return value) to ADDC (which returns /// a result and a carry value), to calls (which may return an arbitrary number /// of values). /// /// As such, each use of a SelectionDAG computation must indicate the node that /// computes it as well as which return value to use from that node. This pair /// of information is represented with the SDValue value type. /// class SDValue { friend struct DenseMapInfo; SDNode *Node; // The node defining the value we are using. unsigned ResNo; // Which return value of the node we are using. public: SDValue() : Node(nullptr), ResNo(0) {} SDValue(SDNode *node, unsigned resno); /// get the index which selects a specific result in the SDNode unsigned getResNo() const { return ResNo; } /// get the SDNode which holds the desired result SDNode *getNode() const { return Node; } /// set the SDNode void setNode(SDNode *N) { Node = N; } inline SDNode *operator->() const { return Node; } bool operator==(const SDValue &O) const { return Node == O.Node && ResNo == O.ResNo; } bool operator!=(const SDValue &O) const { return !operator==(O); } bool operator<(const SDValue &O) const { return std::tie(Node, ResNo) < std::tie(O.Node, O.ResNo); } LLVM_EXPLICIT operator bool() const { return Node != nullptr; } SDValue getValue(unsigned R) const { return SDValue(Node, R); } // isOperandOf - Return true if this node is an operand of N. bool isOperandOf(SDNode *N) const; /// getValueType - Return the ValueType of the referenced return value. /// inline EVT getValueType() const; /// Return the simple ValueType of the referenced return value. MVT getSimpleValueType() const { return getValueType().getSimpleVT(); } /// getValueSizeInBits - Returns the size of the value in bits. /// unsigned getValueSizeInBits() const { return getValueType().getSizeInBits(); } unsigned getScalarValueSizeInBits() const { return getValueType().getScalarType().getSizeInBits(); } // Forwarding methods - These forward to the corresponding methods in SDNode. inline unsigned getOpcode() const; inline unsigned getNumOperands() const; inline const SDValue &getOperand(unsigned i) const; inline uint64_t getConstantOperandVal(unsigned i) const; inline bool isTargetMemoryOpcode() const; inline bool isTargetOpcode() const; inline bool isMachineOpcode() const; inline unsigned getMachineOpcode() const; inline const DebugLoc getDebugLoc() const; inline void dump() const; inline void dumpr() const; /// reachesChainWithoutSideEffects - Return true if this operand (which must /// be a chain) reaches the specified operand without crossing any /// side-effecting instructions. In practice, this looks through token /// factors and non-volatile loads. In order to remain efficient, this only /// looks a couple of nodes in, it does not do an exhaustive search. bool reachesChainWithoutSideEffects(SDValue Dest, unsigned Depth = 2) const; /// use_empty - Return true if there are no nodes using value ResNo /// of Node. /// inline bool use_empty() const; /// hasOneUse - Return true if there is exactly one node using value /// ResNo of Node. /// inline bool hasOneUse() const; }; template<> struct DenseMapInfo { static inline SDValue getEmptyKey() { SDValue V; V.ResNo = -1U; return V; } static inline SDValue getTombstoneKey() { SDValue V; V.ResNo = -2U; return V; } static unsigned getHashValue(const SDValue &Val) { return ((unsigned)((uintptr_t)Val.getNode() >> 4) ^ (unsigned)((uintptr_t)Val.getNode() >> 9)) + Val.getResNo(); } static bool isEqual(const SDValue &LHS, const SDValue &RHS) { return LHS == RHS; } }; template <> struct isPodLike { static const bool value = true; }; /// simplify_type specializations - Allow casting operators to work directly on /// SDValues as if they were SDNode*'s. template<> struct simplify_type { typedef SDNode* SimpleType; static SimpleType getSimplifiedValue(SDValue &Val) { return Val.getNode(); } }; template<> struct simplify_type { typedef /*const*/ SDNode* SimpleType; static SimpleType getSimplifiedValue(const SDValue &Val) { return Val.getNode(); } }; /// SDUse - Represents a use of a SDNode. This class holds an SDValue, /// which records the SDNode being used and the result number, a /// pointer to the SDNode using the value, and Next and Prev pointers, /// which link together all the uses of an SDNode. /// class SDUse { /// Val - The value being used. SDValue Val; /// User - The user of this value. SDNode *User; /// Prev, Next - Pointers to the uses list of the SDNode referred by /// this operand. SDUse **Prev, *Next; SDUse(const SDUse &U) LLVM_DELETED_FUNCTION; void operator=(const SDUse &U) LLVM_DELETED_FUNCTION; public: SDUse() : Val(), User(nullptr), Prev(nullptr), Next(nullptr) {} /// Normally SDUse will just implicitly convert to an SDValue that it holds. operator const SDValue&() const { return Val; } /// If implicit conversion to SDValue doesn't work, the get() method returns /// the SDValue. const SDValue &get() const { return Val; } /// getUser - This returns the SDNode that contains this Use. SDNode *getUser() { return User; } /// getNext - Get the next SDUse in the use list. SDUse *getNext() const { return Next; } /// getNode - Convenience function for get().getNode(). SDNode *getNode() const { return Val.getNode(); } /// getResNo - Convenience function for get().getResNo(). unsigned getResNo() const { return Val.getResNo(); } /// getValueType - Convenience function for get().getValueType(). EVT getValueType() const { return Val.getValueType(); } /// operator== - Convenience function for get().operator== bool operator==(const SDValue &V) const { return Val == V; } /// operator!= - Convenience function for get().operator!= bool operator!=(const SDValue &V) const { return Val != V; } /// operator< - Convenience function for get().operator< bool operator<(const SDValue &V) const { return Val < V; } private: friend class SelectionDAG; friend class SDNode; void setUser(SDNode *p) { User = p; } /// set - Remove this use from its existing use list, assign it the /// given value, and add it to the new value's node's use list. inline void set(const SDValue &V); /// setInitial - like set, but only supports initializing a newly-allocated /// SDUse with a non-null value. inline void setInitial(const SDValue &V); /// setNode - like set, but only sets the Node portion of the value, /// leaving the ResNo portion unmodified. inline void setNode(SDNode *N); void addToList(SDUse **List) { Next = *List; if (Next) Next->Prev = &Next; Prev = List; *List = this; } void removeFromList() { *Prev = Next; if (Next) Next->Prev = Prev; } }; /// simplify_type specializations - Allow casting operators to work directly on /// SDValues as if they were SDNode*'s. template<> struct simplify_type { typedef SDNode* SimpleType; static SimpleType getSimplifiedValue(SDUse &Val) { return Val.getNode(); } }; /// SDNode - Represents one node in the SelectionDAG. /// class SDNode : public FoldingSetNode, public ilist_node { private: /// NodeType - The operation that this node performs. /// int16_t NodeType; /// OperandsNeedDelete - This is true if OperandList was new[]'d. If true, /// then they will be delete[]'d when the node is destroyed. uint16_t OperandsNeedDelete : 1; /// HasDebugValue - This tracks whether this node has one or more dbg_value /// nodes corresponding to it. uint16_t HasDebugValue : 1; protected: /// SubclassData - This member is defined by this class, but is not used for /// anything. Subclasses can use it to hold whatever state they find useful. /// This field is initialized to zero by the ctor. uint16_t SubclassData : 14; private: /// NodeId - Unique id per SDNode in the DAG. int NodeId; /// OperandList - The values that are used by this operation. /// SDUse *OperandList; /// ValueList - The types of the values this node defines. SDNode's may /// define multiple values simultaneously. const EVT *ValueList; /// UseList - List of uses for this SDNode. SDUse *UseList; /// NumOperands/NumValues - The number of entries in the Operand/Value list. unsigned short NumOperands, NumValues; /// debugLoc - source line information. DebugLoc debugLoc; // The ordering of the SDNodes. It roughly corresponds to the ordering of the // original LLVM instructions. // This is used for turning off scheduling, because we'll forgo // the normal scheduling algorithms and output the instructions according to // this ordering. unsigned IROrder; /// getValueTypeList - Return a pointer to the specified value type. static const EVT *getValueTypeList(EVT VT); friend class SelectionDAG; friend struct ilist_traits; public: //===--------------------------------------------------------------------===// // Accessors // /// getOpcode - Return the SelectionDAG opcode value for this node. For /// pre-isel nodes (those for which isMachineOpcode returns false), these /// are the opcode values in the ISD and ISD namespaces. For /// post-isel opcodes, see getMachineOpcode. unsigned getOpcode() const { return (unsigned short)NodeType; } /// isTargetOpcode - Test if this node has a target-specific opcode (in the /// \ISD namespace). bool isTargetOpcode() const { return NodeType >= ISD::BUILTIN_OP_END; } /// isTargetMemoryOpcode - Test if this node has a target-specific /// memory-referencing opcode (in the \ISD namespace and /// greater than FIRST_TARGET_MEMORY_OPCODE). bool isTargetMemoryOpcode() const { return NodeType >= ISD::FIRST_TARGET_MEMORY_OPCODE; } /// Test if this node is a memory intrinsic (with valid pointer information). /// INTRINSIC_W_CHAIN and INTRINSIC_VOID nodes are sometimes created for /// non-memory intrinsics (with chains) that are not really instances of /// MemSDNode. For such nodes, we need some extra state to determine the /// proper classof relationship. bool isMemIntrinsic() const { return (NodeType == ISD::INTRINSIC_W_CHAIN || NodeType == ISD::INTRINSIC_VOID) && ((SubclassData >> 13) & 1); } /// isMachineOpcode - Test if this node has a post-isel opcode, directly /// corresponding to a MachineInstr opcode. bool isMachineOpcode() const { return NodeType < 0; } /// getMachineOpcode - This may only be called if isMachineOpcode returns /// true. It returns the MachineInstr opcode value that the node's opcode /// corresponds to. unsigned getMachineOpcode() const { assert(isMachineOpcode() && "Not a MachineInstr opcode!"); return ~NodeType; } /// getHasDebugValue - get this bit. bool getHasDebugValue() const { return HasDebugValue; } /// setHasDebugValue - set this bit. void setHasDebugValue(bool b) { HasDebugValue = b; } /// use_empty - Return true if there are no uses of this node. /// bool use_empty() const { return UseList == nullptr; } /// hasOneUse - Return true if there is exactly one use of this node. /// bool hasOneUse() const { return !use_empty() && std::next(use_begin()) == use_end(); } /// use_size - Return the number of uses of this node. This method takes /// time proportional to the number of uses. /// size_t use_size() const { return std::distance(use_begin(), use_end()); } /// getNodeId - Return the unique node id. /// int getNodeId() const { return NodeId; } /// setNodeId - Set unique node id. void setNodeId(int Id) { NodeId = Id; } /// getIROrder - Return the node ordering. /// unsigned getIROrder() const { return IROrder; } /// setIROrder - Set the node ordering. /// void setIROrder(unsigned Order) { IROrder = Order; } /// getDebugLoc - Return the source location info. const DebugLoc getDebugLoc() const { return debugLoc; } /// setDebugLoc - Set source location info. Try to avoid this, putting /// it in the constructor is preferable. void setDebugLoc(const DebugLoc dl) { debugLoc = dl; } /// use_iterator - This class provides iterator support for SDUse /// operands that use a specific SDNode. class use_iterator : public std::iterator { SDUse *Op; explicit use_iterator(SDUse *op) : Op(op) { } friend class SDNode; public: typedef std::iterator::reference reference; typedef std::iterator::pointer pointer; use_iterator(const use_iterator &I) : Op(I.Op) {} use_iterator() : Op(nullptr) {} bool operator==(const use_iterator &x) const { return Op == x.Op; } bool operator!=(const use_iterator &x) const { return !operator==(x); } /// atEnd - return true if this iterator is at the end of uses list. bool atEnd() const { return Op == nullptr; } // Iterator traversal: forward iteration only. use_iterator &operator++() { // Preincrement assert(Op && "Cannot increment end iterator!"); Op = Op->getNext(); return *this; } use_iterator operator++(int) { // Postincrement use_iterator tmp = *this; ++*this; return tmp; } /// Retrieve a pointer to the current user node. SDNode *operator*() const { assert(Op && "Cannot dereference end iterator!"); return Op->getUser(); } SDNode *operator->() const { return operator*(); } SDUse &getUse() const { return *Op; } /// getOperandNo - Retrieve the operand # of this use in its user. /// unsigned getOperandNo() const { assert(Op && "Cannot dereference end iterator!"); return (unsigned)(Op - Op->getUser()->OperandList); } }; /// use_begin/use_end - Provide iteration support to walk over all uses /// of an SDNode. use_iterator use_begin() const { return use_iterator(UseList); } static use_iterator use_end() { return use_iterator(nullptr); } inline iterator_range uses() { return iterator_range(use_begin(), use_end()); } inline iterator_range uses() const { return iterator_range(use_begin(), use_end()); } /// hasNUsesOfValue - Return true if there are exactly NUSES uses of the /// indicated value. This method ignores uses of other values defined by this /// operation. bool hasNUsesOfValue(unsigned NUses, unsigned Value) const; /// hasAnyUseOfValue - Return true if there are any use of the indicated /// value. This method ignores uses of other values defined by this operation. bool hasAnyUseOfValue(unsigned Value) const; /// isOnlyUserOf - Return true if this node is the only use of N. /// bool isOnlyUserOf(SDNode *N) const; /// isOperandOf - Return true if this node is an operand of N. /// bool isOperandOf(SDNode *N) const; /// isPredecessorOf - Return true if this node is a predecessor of N. /// NOTE: Implemented on top of hasPredecessor and every bit as /// expensive. Use carefully. bool isPredecessorOf(const SDNode *N) const { return N->hasPredecessor(this); } /// hasPredecessor - Return true if N is a predecessor of this node. /// N is either an operand of this node, or can be reached by recursively /// traversing up the operands. /// NOTE: This is an expensive method. Use it carefully. bool hasPredecessor(const SDNode *N) const; /// hasPredecesorHelper - Return true if N is a predecessor of this node. /// N is either an operand of this node, or can be reached by recursively /// traversing up the operands. /// In this helper the Visited and worklist sets are held externally to /// cache predecessors over multiple invocations. If you want to test for /// multiple predecessors this method is preferable to multiple calls to /// hasPredecessor. Be sure to clear Visited and Worklist if the DAG /// changes. /// NOTE: This is still very expensive. Use carefully. bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl &Visited, SmallVectorImpl &Worklist) const; /// getNumOperands - Return the number of values used by this operation. /// unsigned getNumOperands() const { return NumOperands; } /// getConstantOperandVal - Helper method returns the integer value of a /// ConstantSDNode operand. uint64_t getConstantOperandVal(unsigned Num) const; const SDValue &getOperand(unsigned Num) const { assert(Num < NumOperands && "Invalid child # of SDNode!"); return OperandList[Num]; } typedef SDUse* op_iterator; op_iterator op_begin() const { return OperandList; } op_iterator op_end() const { return OperandList+NumOperands; } ArrayRef ops() const { return makeArrayRef(op_begin(), op_end()); } SDVTList getVTList() const { SDVTList X = { ValueList, NumValues }; return X; } /// getGluedNode - If this node has a glue operand, return the node /// to which the glue operand points. Otherwise return NULL. SDNode *getGluedNode() const { if (getNumOperands() != 0 && getOperand(getNumOperands()-1).getValueType() == MVT::Glue) return getOperand(getNumOperands()-1).getNode(); return nullptr; } // If this is a pseudo op, like copyfromreg, look to see if there is a // real target node glued to it. If so, return the target node. const SDNode *getGluedMachineNode() const { const SDNode *FoundNode = this; // Climb up glue edges until a machine-opcode node is found, or the // end of the chain is reached. while (!FoundNode->isMachineOpcode()) { const SDNode *N = FoundNode->getGluedNode(); if (!N) break; FoundNode = N; } return FoundNode; } /// getGluedUser - If this node has a glue value with a user, return /// the user (there is at most one). Otherwise return NULL. SDNode *getGluedUser() const { for (use_iterator UI = use_begin(), UE = use_end(); UI != UE; ++UI) if (UI.getUse().get().getValueType() == MVT::Glue) return *UI; return nullptr; } /// getNumValues - Return the number of values defined/returned by this /// operator. /// unsigned getNumValues() const { return NumValues; } /// getValueType - Return the type of a specified result. /// EVT getValueType(unsigned ResNo) const { assert(ResNo < NumValues && "Illegal result number!"); return ValueList[ResNo]; } /// Return the type of a specified result as a simple type. /// MVT getSimpleValueType(unsigned ResNo) const { return getValueType(ResNo).getSimpleVT(); } /// getValueSizeInBits - Returns MVT::getSizeInBits(getValueType(ResNo)). /// unsigned getValueSizeInBits(unsigned ResNo) const { return getValueType(ResNo).getSizeInBits(); } typedef const EVT* value_iterator; value_iterator value_begin() const { return ValueList; } value_iterator value_end() const { return ValueList+NumValues; } /// getOperationName - Return the opcode of this operation for printing. /// std::string getOperationName(const SelectionDAG *G = nullptr) const; static const char* getIndexedModeName(ISD::MemIndexedMode AM); void print_types(raw_ostream &OS, const SelectionDAG *G) const; void print_details(raw_ostream &OS, const SelectionDAG *G) const; void print(raw_ostream &OS, const SelectionDAG *G = nullptr) const; void printr(raw_ostream &OS, const SelectionDAG *G = nullptr) const; /// printrFull - Print a SelectionDAG node and all children down to /// the leaves. The given SelectionDAG allows target-specific nodes /// to be printed in human-readable form. Unlike printr, this will /// print the whole DAG, including children that appear multiple /// times. /// void printrFull(raw_ostream &O, const SelectionDAG *G = nullptr) const; /// printrWithDepth - Print a SelectionDAG node and children up to /// depth "depth." The given SelectionDAG allows target-specific /// nodes to be printed in human-readable form. Unlike printr, this /// will print children that appear multiple times wherever they are /// used. /// void printrWithDepth(raw_ostream &O, const SelectionDAG *G = nullptr, unsigned depth = 100) const; /// dump - Dump this node, for debugging. void dump() const; /// dumpr - Dump (recursively) this node and its use-def subgraph. void dumpr() const; /// dump - Dump this node, for debugging. /// The given SelectionDAG allows target-specific nodes to be printed /// in human-readable form. void dump(const SelectionDAG *G) const; /// dumpr - Dump (recursively) this node and its use-def subgraph. /// The given SelectionDAG allows target-specific nodes to be printed /// in human-readable form. void dumpr(const SelectionDAG *G) const; /// dumprFull - printrFull to dbgs(). The given SelectionDAG allows /// target-specific nodes to be printed in human-readable form. /// Unlike dumpr, this will print the whole DAG, including children /// that appear multiple times. /// void dumprFull(const SelectionDAG *G = nullptr) const; /// dumprWithDepth - printrWithDepth to dbgs(). The given /// SelectionDAG allows target-specific nodes to be printed in /// human-readable form. Unlike dumpr, this will print children /// that appear multiple times wherever they are used. /// void dumprWithDepth(const SelectionDAG *G = nullptr, unsigned depth = 100) const; /// Profile - Gather unique data for the node. /// void Profile(FoldingSetNodeID &ID) const; /// addUse - This method should only be used by the SDUse class. /// void addUse(SDUse &U) { U.addToList(&UseList); } protected: static SDVTList getSDVTList(EVT VT) { SDVTList Ret = { getValueTypeList(VT), 1 }; return Ret; } SDNode(unsigned Opc, unsigned Order, const DebugLoc dl, SDVTList VTs, ArrayRef Ops) : NodeType(Opc), OperandsNeedDelete(true), HasDebugValue(false), SubclassData(0), NodeId(-1), OperandList(Ops.size() ? new SDUse[Ops.size()] : nullptr), ValueList(VTs.VTs), UseList(nullptr), NumOperands(Ops.size()), NumValues(VTs.NumVTs), debugLoc(dl), IROrder(Order) { assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); assert(NumOperands == Ops.size() && "NumOperands wasn't wide enough for its operands!"); assert(NumValues == VTs.NumVTs && "NumValues wasn't wide enough for its operands!"); for (unsigned i = 0; i != Ops.size(); ++i) { assert(OperandList && "no operands available"); OperandList[i].setUser(this); OperandList[i].setInitial(Ops[i]); } checkForCycles(this); } /// This constructor adds no operands itself; operands can be /// set later with InitOperands. SDNode(unsigned Opc, unsigned Order, const DebugLoc dl, SDVTList VTs) : NodeType(Opc), OperandsNeedDelete(false), HasDebugValue(false), SubclassData(0), NodeId(-1), OperandList(nullptr), ValueList(VTs.VTs), UseList(nullptr), NumOperands(0), NumValues(VTs.NumVTs), debugLoc(dl), IROrder(Order) { assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); assert(NumValues == VTs.NumVTs && "NumValues wasn't wide enough for its operands!"); } /// InitOperands - Initialize the operands list of this with 1 operand. void InitOperands(SDUse *Ops, const SDValue &Op0) { Ops[0].setUser(this); Ops[0].setInitial(Op0); NumOperands = 1; OperandList = Ops; checkForCycles(this); } /// InitOperands - Initialize the operands list of this with 2 operands. void InitOperands(SDUse *Ops, const SDValue &Op0, const SDValue &Op1) { Ops[0].setUser(this); Ops[0].setInitial(Op0); Ops[1].setUser(this); Ops[1].setInitial(Op1); NumOperands = 2; OperandList = Ops; checkForCycles(this); } /// InitOperands - Initialize the operands list of this with 3 operands. void InitOperands(SDUse *Ops, const SDValue &Op0, const SDValue &Op1, const SDValue &Op2) { Ops[0].setUser(this); Ops[0].setInitial(Op0); Ops[1].setUser(this); Ops[1].setInitial(Op1); Ops[2].setUser(this); Ops[2].setInitial(Op2); NumOperands = 3; OperandList = Ops; checkForCycles(this); } /// InitOperands - Initialize the operands list of this with 4 operands. void InitOperands(SDUse *Ops, const SDValue &Op0, const SDValue &Op1, const SDValue &Op2, const SDValue &Op3) { Ops[0].setUser(this); Ops[0].setInitial(Op0); Ops[1].setUser(this); Ops[1].setInitial(Op1); Ops[2].setUser(this); Ops[2].setInitial(Op2); Ops[3].setUser(this); Ops[3].setInitial(Op3); NumOperands = 4; OperandList = Ops; checkForCycles(this); } /// InitOperands - Initialize the operands list of this with N operands. void InitOperands(SDUse *Ops, const SDValue *Vals, unsigned N) { for (unsigned i = 0; i != N; ++i) { Ops[i].setUser(this); Ops[i].setInitial(Vals[i]); } NumOperands = N; assert(NumOperands == N && "NumOperands wasn't wide enough for its operands!"); OperandList = Ops; checkForCycles(this); } /// DropOperands - Release the operands and set this node to have /// zero operands. void DropOperands(); }; /// Wrapper class for IR location info (IR ordering and DebugLoc) to be passed /// into SDNode creation functions. /// When an SDNode is created from the DAGBuilder, the DebugLoc is extracted /// from the original Instruction, and IROrder is the ordinal position of /// the instruction. /// When an SDNode is created after the DAG is being built, both DebugLoc and /// the IROrder are propagated from the original SDNode. /// So SDLoc class provides two constructors besides the default one, one to /// be used by the DAGBuilder, the other to be used by others. class SDLoc { private: // Ptr could be used for either Instruction* or SDNode*. It is used for // Instruction* if IROrder is not -1. const void *Ptr; int IROrder; public: SDLoc() : Ptr(nullptr), IROrder(0) {} SDLoc(const SDNode *N) : Ptr(N), IROrder(-1) { assert(N && "null SDNode"); } SDLoc(const SDValue V) : Ptr(V.getNode()), IROrder(-1) { assert(Ptr && "null SDNode"); } SDLoc(const Instruction *I, int Order) : Ptr(I), IROrder(Order) { assert(Order >= 0 && "bad IROrder"); } unsigned getIROrder() { if (IROrder >= 0 || Ptr == nullptr) { return (unsigned)IROrder; } const SDNode *N = (const SDNode*)(Ptr); return N->getIROrder(); } DebugLoc getDebugLoc() { if (!Ptr) { return DebugLoc(); } if (IROrder >= 0) { const Instruction *I = (const Instruction*)(Ptr); return I->getDebugLoc(); } const SDNode *N = (const SDNode*)(Ptr); return N->getDebugLoc(); } }; // Define inline functions from the SDValue class. inline SDValue::SDValue(SDNode *node, unsigned resno) : Node(node), ResNo(resno) { assert((!Node || ResNo < Node->getNumValues()) && "Invalid result number for the given node!"); assert(ResNo < -2U && "Cannot use result numbers reserved for DenseMaps."); } inline unsigned SDValue::getOpcode() const { return Node->getOpcode(); } inline EVT SDValue::getValueType() const { return Node->getValueType(ResNo); } inline unsigned SDValue::getNumOperands() const { return Node->getNumOperands(); } inline const SDValue &SDValue::getOperand(unsigned i) const { return Node->getOperand(i); } inline uint64_t SDValue::getConstantOperandVal(unsigned i) const { return Node->getConstantOperandVal(i); } inline bool SDValue::isTargetOpcode() const { return Node->isTargetOpcode(); } inline bool SDValue::isTargetMemoryOpcode() const { return Node->isTargetMemoryOpcode(); } inline bool SDValue::isMachineOpcode() const { return Node->isMachineOpcode(); } inline unsigned SDValue::getMachineOpcode() const { return Node->getMachineOpcode(); } inline bool SDValue::use_empty() const { return !Node->hasAnyUseOfValue(ResNo); } inline bool SDValue::hasOneUse() const { return Node->hasNUsesOfValue(1, ResNo); } inline const DebugLoc SDValue::getDebugLoc() const { return Node->getDebugLoc(); } inline void SDValue::dump() const { return Node->dump(); } inline void SDValue::dumpr() const { return Node->dumpr(); } // Define inline functions from the SDUse class. inline void SDUse::set(const SDValue &V) { if (Val.getNode()) removeFromList(); Val = V; if (V.getNode()) V.getNode()->addUse(*this); } inline void SDUse::setInitial(const SDValue &V) { Val = V; V.getNode()->addUse(*this); } inline void SDUse::setNode(SDNode *N) { if (Val.getNode()) removeFromList(); Val.setNode(N); if (N) N->addUse(*this); } /// UnarySDNode - This class is used for single-operand SDNodes. This is solely /// to allow co-allocation of node operands with the node itself. class UnarySDNode : public SDNode { SDUse Op; public: UnarySDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, SDValue X) : SDNode(Opc, Order, dl, VTs) { InitOperands(&Op, X); } }; /// BinarySDNode - This class is used for two-operand SDNodes. This is solely /// to allow co-allocation of node operands with the node itself. class BinarySDNode : public SDNode { SDUse Ops[2]; public: BinarySDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, SDValue X, SDValue Y) : SDNode(Opc, Order, dl, VTs) { InitOperands(Ops, X, Y); } }; /// BinaryWithFlagsSDNode - This class is an extension of BinarySDNode /// used from those opcodes that have associated extra flags. class BinaryWithFlagsSDNode : public BinarySDNode { enum { NUW = (1 << 0), NSW = (1 << 1), EXACT = (1 << 2) }; public: BinaryWithFlagsSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, SDValue X, SDValue Y) : BinarySDNode(Opc, Order, dl, VTs, X, Y) {} /// getRawSubclassData - Return the SubclassData value, which contains an /// encoding of the flags. /// This function should be used to add subclass data to the NodeID value. unsigned getRawSubclassData() const { return SubclassData; } void setHasNoUnsignedWrap(bool b) { SubclassData = (SubclassData & ~NUW) | (b ? NUW : 0); } void setHasNoSignedWrap(bool b) { SubclassData = (SubclassData & ~NSW) | (b ? NSW : 0); } void setIsExact(bool b) { SubclassData = (SubclassData & ~EXACT) | (b ? EXACT : 0); } bool hasNoUnsignedWrap() const { return SubclassData & NUW; } bool hasNoSignedWrap() const { return SubclassData & NSW; } bool isExact() const { return SubclassData & EXACT; } static bool classof(const SDNode *N) { return isBinOpWithFlags(N->getOpcode()); } }; /// TernarySDNode - This class is used for three-operand SDNodes. This is solely /// to allow co-allocation of node operands with the node itself. class TernarySDNode : public SDNode { SDUse Ops[3]; public: TernarySDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, SDValue X, SDValue Y, SDValue Z) : SDNode(Opc, Order, dl, VTs) { InitOperands(Ops, X, Y, Z); } }; /// HandleSDNode - This class is used to form a handle around another node that /// is persistent and is updated across invocations of replaceAllUsesWith on its /// operand. This node should be directly created by end-users and not added to /// the AllNodes list. class HandleSDNode : public SDNode { SDUse Op; public: explicit HandleSDNode(SDValue X) : SDNode(ISD::HANDLENODE, 0, DebugLoc(), getSDVTList(MVT::Other)) { InitOperands(&Op, X); } ~HandleSDNode(); const SDValue &getValue() const { return Op; } }; class AddrSpaceCastSDNode : public UnarySDNode { private: unsigned SrcAddrSpace; unsigned DestAddrSpace; public: AddrSpaceCastSDNode(unsigned Order, DebugLoc dl, EVT VT, SDValue X, unsigned SrcAS, unsigned DestAS); unsigned getSrcAddressSpace() const { return SrcAddrSpace; } unsigned getDestAddressSpace() const { return DestAddrSpace; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ADDRSPACECAST; } }; /// Abstact virtual class for operations for memory operations class MemSDNode : public SDNode { private: // MemoryVT - VT of in-memory value. EVT MemoryVT; protected: /// MMO - Memory reference information. MachineMemOperand *MMO; public: MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, EVT MemoryVT, MachineMemOperand *MMO); MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, ArrayRef Ops, EVT MemoryVT, MachineMemOperand *MMO); bool readMem() const { return MMO->isLoad(); } bool writeMem() const { return MMO->isStore(); } /// Returns alignment and volatility of the memory access unsigned getOriginalAlignment() const { return MMO->getBaseAlignment(); } unsigned getAlignment() const { return MMO->getAlignment(); } /// getRawSubclassData - Return the SubclassData value, which contains an /// encoding of the volatile flag, as well as bits used by subclasses. This /// function should only be used to compute a FoldingSetNodeID value. unsigned getRawSubclassData() const { return SubclassData; } // We access subclass data here so that we can check consistency // with MachineMemOperand information. bool isVolatile() const { return (SubclassData >> 5) & 1; } bool isNonTemporal() const { return (SubclassData >> 6) & 1; } bool isInvariant() const { return (SubclassData >> 7) & 1; } AtomicOrdering getOrdering() const { return AtomicOrdering((SubclassData >> 8) & 15); } SynchronizationScope getSynchScope() const { return SynchronizationScope((SubclassData >> 12) & 1); } // Returns the offset from the location of the access. int64_t getSrcValueOffset() const { return MMO->getOffset(); } /// Returns the AA info that describes the dereference. AAMDNodes getAAInfo() const { return MMO->getAAInfo(); } /// Returns the Ranges that describes the dereference. const MDNode *getRanges() const { return MMO->getRanges(); } /// getMemoryVT - Return the type of the in-memory value. EVT getMemoryVT() const { return MemoryVT; } /// getMemOperand - Return a MachineMemOperand object describing the memory /// reference performed by operation. MachineMemOperand *getMemOperand() const { return MMO; } const MachinePointerInfo &getPointerInfo() const { return MMO->getPointerInfo(); } /// getAddressSpace - Return the address space for the associated pointer unsigned getAddressSpace() const { return getPointerInfo().getAddrSpace(); } /// refineAlignment - Update this MemSDNode's MachineMemOperand information /// to reflect the alignment of NewMMO, if it has a greater alignment. /// This must only be used when the new alignment applies to all users of /// this MachineMemOperand. void refineAlignment(const MachineMemOperand *NewMMO) { MMO->refineAlignment(NewMMO); } const SDValue &getChain() const { return getOperand(0); } const SDValue &getBasePtr() const { return getOperand(getOpcode() == ISD::STORE ? 2 : 1); } // Methods to support isa and dyn_cast static bool classof(const SDNode *N) { // For some targets, we lower some target intrinsics to a MemIntrinsicNode // with either an intrinsic or a target opcode. return N->getOpcode() == ISD::LOAD || N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::PREFETCH || N->getOpcode() == ISD::ATOMIC_CMP_SWAP || N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS || N->getOpcode() == ISD::ATOMIC_SWAP || N->getOpcode() == ISD::ATOMIC_LOAD_ADD || N->getOpcode() == ISD::ATOMIC_LOAD_SUB || N->getOpcode() == ISD::ATOMIC_LOAD_AND || N->getOpcode() == ISD::ATOMIC_LOAD_OR || N->getOpcode() == ISD::ATOMIC_LOAD_XOR || N->getOpcode() == ISD::ATOMIC_LOAD_NAND || N->getOpcode() == ISD::ATOMIC_LOAD_MIN || N->getOpcode() == ISD::ATOMIC_LOAD_MAX || N->getOpcode() == ISD::ATOMIC_LOAD_UMIN || N->getOpcode() == ISD::ATOMIC_LOAD_UMAX || N->getOpcode() == ISD::ATOMIC_LOAD || N->getOpcode() == ISD::ATOMIC_STORE || N->getOpcode() == ISD::MLOAD || N->getOpcode() == ISD::MSTORE || N->isMemIntrinsic() || N->isTargetMemoryOpcode(); } }; /// AtomicSDNode - A SDNode reprenting atomic operations. /// class AtomicSDNode : public MemSDNode { SDUse Ops[4]; /// For cmpxchg instructions, the ordering requirements when a store does not /// occur. AtomicOrdering FailureOrdering; void InitAtomic(AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) { // This must match encodeMemSDNodeFlags() in SelectionDAG.cpp. assert((SuccessOrdering & 15) == SuccessOrdering && "Ordering may not require more than 4 bits!"); assert((FailureOrdering & 15) == FailureOrdering && "Ordering may not require more than 4 bits!"); assert((SynchScope & 1) == SynchScope && "SynchScope may not require more than 1 bit!"); SubclassData |= SuccessOrdering << 8; SubclassData |= SynchScope << 12; this->FailureOrdering = FailureOrdering; assert(getSuccessOrdering() == SuccessOrdering && "Ordering encoding error!"); assert(getFailureOrdering() == FailureOrdering && "Ordering encoding error!"); assert(getSynchScope() == SynchScope && "Synch-scope encoding error!"); } public: // Opc: opcode for atomic // VTL: value type list // Chain: memory chain for operaand // Ptr: address to update as a SDValue // Cmp: compare value // Swp: swap value // SrcVal: address to update as a Value (used for MemOperand) // Align: alignment of memory AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope) : MemSDNode(Opc, Order, dl, VTL, MemVT, MMO) { InitAtomic(Ordering, Ordering, SynchScope); InitOperands(Ops, Chain, Ptr, Cmp, Swp); } AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope) : MemSDNode(Opc, Order, dl, VTL, MemVT, MMO) { InitAtomic(Ordering, Ordering, SynchScope); InitOperands(Ops, Chain, Ptr, Val); } AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL, EVT MemVT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope) : MemSDNode(Opc, Order, dl, VTL, MemVT, MMO) { InitAtomic(Ordering, Ordering, SynchScope); InitOperands(Ops, Chain, Ptr); } AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL, EVT MemVT, const SDValue* AllOps, SDUse *DynOps, unsigned NumOps, MachineMemOperand *MMO, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) : MemSDNode(Opc, Order, dl, VTL, MemVT, MMO) { InitAtomic(SuccessOrdering, FailureOrdering, SynchScope); assert((DynOps || NumOps <= array_lengthof(Ops)) && "Too many ops for internal storage!"); InitOperands(DynOps ? DynOps : Ops, AllOps, NumOps); } const SDValue &getBasePtr() const { return getOperand(1); } const SDValue &getVal() const { return getOperand(2); } AtomicOrdering getSuccessOrdering() const { return getOrdering(); } // Not quite enough room in SubclassData for everything, so failure gets its // own field. AtomicOrdering getFailureOrdering() const { return FailureOrdering; } bool isCompareAndSwap() const { unsigned Op = getOpcode(); return Op == ISD::ATOMIC_CMP_SWAP || Op == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS; } // Methods to support isa and dyn_cast static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ATOMIC_CMP_SWAP || N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS || N->getOpcode() == ISD::ATOMIC_SWAP || N->getOpcode() == ISD::ATOMIC_LOAD_ADD || N->getOpcode() == ISD::ATOMIC_LOAD_SUB || N->getOpcode() == ISD::ATOMIC_LOAD_AND || N->getOpcode() == ISD::ATOMIC_LOAD_OR || N->getOpcode() == ISD::ATOMIC_LOAD_XOR || N->getOpcode() == ISD::ATOMIC_LOAD_NAND || N->getOpcode() == ISD::ATOMIC_LOAD_MIN || N->getOpcode() == ISD::ATOMIC_LOAD_MAX || N->getOpcode() == ISD::ATOMIC_LOAD_UMIN || N->getOpcode() == ISD::ATOMIC_LOAD_UMAX || N->getOpcode() == ISD::ATOMIC_LOAD || N->getOpcode() == ISD::ATOMIC_STORE; } }; /// MemIntrinsicSDNode - This SDNode is used for target intrinsics that touch /// memory and need an associated MachineMemOperand. Its opcode may be /// INTRINSIC_VOID, INTRINSIC_W_CHAIN, PREFETCH, or a target-specific opcode /// with a value not less than FIRST_TARGET_MEMORY_OPCODE. class MemIntrinsicSDNode : public MemSDNode { public: MemIntrinsicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, ArrayRef Ops, EVT MemoryVT, MachineMemOperand *MMO) : MemSDNode(Opc, Order, dl, VTs, Ops, MemoryVT, MMO) { SubclassData |= 1u << 13; } // Methods to support isa and dyn_cast static bool classof(const SDNode *N) { // We lower some target intrinsics to their target opcode // early a node with a target opcode can be of this class return N->isMemIntrinsic() || N->getOpcode() == ISD::PREFETCH || N->isTargetMemoryOpcode(); } }; /// ShuffleVectorSDNode - This SDNode is used to implement the code generator /// support for the llvm IR shufflevector instruction. It combines elements /// from two input vectors into a new input vector, with the selection and /// ordering of elements determined by an array of integers, referred to as /// the shuffle mask. For input vectors of width N, mask indices of 0..N-1 /// refer to elements from the LHS input, and indices from N to 2N-1 the RHS. /// An index of -1 is treated as undef, such that the code generator may put /// any value in the corresponding element of the result. class ShuffleVectorSDNode : public SDNode { SDUse Ops[2]; // The memory for Mask is owned by the SelectionDAG's OperandAllocator, and // is freed when the SelectionDAG object is destroyed. const int *Mask; protected: friend class SelectionDAG; ShuffleVectorSDNode(EVT VT, unsigned Order, DebugLoc dl, SDValue N1, SDValue N2, const int *M) : SDNode(ISD::VECTOR_SHUFFLE, Order, dl, getSDVTList(VT)), Mask(M) { InitOperands(Ops, N1, N2); } public: ArrayRef getMask() const { EVT VT = getValueType(0); return makeArrayRef(Mask, VT.getVectorNumElements()); } int getMaskElt(unsigned Idx) const { assert(Idx < getValueType(0).getVectorNumElements() && "Idx out of range!"); return Mask[Idx]; } bool isSplat() const { return isSplatMask(Mask, getValueType(0)); } int getSplatIndex() const { assert(isSplat() && "Cannot get splat index for non-splat!"); EVT VT = getValueType(0); for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { if (Mask[i] >= 0) return Mask[i]; } llvm_unreachable("Splat with all undef indices?"); } static bool isSplatMask(const int *Mask, EVT VT); static bool classof(const SDNode *N) { return N->getOpcode() == ISD::VECTOR_SHUFFLE; } }; class ConstantSDNode : public SDNode { const ConstantInt *Value; friend class SelectionDAG; ConstantSDNode(bool isTarget, bool isOpaque, const ConstantInt *val, EVT VT) : SDNode(isTarget ? ISD::TargetConstant : ISD::Constant, 0, DebugLoc(), getSDVTList(VT)), Value(val) { SubclassData |= (uint16_t)isOpaque; } public: const ConstantInt *getConstantIntValue() const { return Value; } const APInt &getAPIntValue() const { return Value->getValue(); } uint64_t getZExtValue() const { return Value->getZExtValue(); } int64_t getSExtValue() const { return Value->getSExtValue(); } bool isOne() const { return Value->isOne(); } bool isNullValue() const { return Value->isNullValue(); } bool isAllOnesValue() const { return Value->isAllOnesValue(); } bool isOpaque() const { return SubclassData & 1; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::Constant || N->getOpcode() == ISD::TargetConstant; } }; class ConstantFPSDNode : public SDNode { const ConstantFP *Value; friend class SelectionDAG; ConstantFPSDNode(bool isTarget, const ConstantFP *val, EVT VT) : SDNode(isTarget ? ISD::TargetConstantFP : ISD::ConstantFP, 0, DebugLoc(), getSDVTList(VT)), Value(val) { } public: const APFloat& getValueAPF() const { return Value->getValueAPF(); } const ConstantFP *getConstantFPValue() const { return Value; } /// isZero - Return true if the value is positive or negative zero. bool isZero() const { return Value->isZero(); } /// isNaN - Return true if the value is a NaN. bool isNaN() const { return Value->isNaN(); } /// isInfinity - Return true if the value is an infinity bool isInfinity() const { return Value->isInfinity(); } /// isNegative - Return true if the value is negative. bool isNegative() const { return Value->isNegative(); } /// isExactlyValue - We don't rely on operator== working on double values, as /// it returns true for things that are clearly not equal, like -0.0 and 0.0. /// As such, this method can be used to do an exact bit-for-bit comparison of /// two floating point values. /// We leave the version with the double argument here because it's just so /// convenient to write "2.0" and the like. Without this function we'd /// have to duplicate its logic everywhere it's called. bool isExactlyValue(double V) const { bool ignored; APFloat Tmp(V); Tmp.convert(Value->getValueAPF().getSemantics(), APFloat::rmNearestTiesToEven, &ignored); return isExactlyValue(Tmp); } bool isExactlyValue(const APFloat& V) const; static bool isValueValidForType(EVT VT, const APFloat& Val); static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ConstantFP || N->getOpcode() == ISD::TargetConstantFP; } }; class GlobalAddressSDNode : public SDNode { const GlobalValue *TheGlobal; int64_t Offset; unsigned char TargetFlags; friend class SelectionDAG; GlobalAddressSDNode(unsigned Opc, unsigned Order, DebugLoc DL, const GlobalValue *GA, EVT VT, int64_t o, unsigned char TargetFlags); public: const GlobalValue *getGlobal() const { return TheGlobal; } int64_t getOffset() const { return Offset; } unsigned char getTargetFlags() const { return TargetFlags; } // Return the address space this GlobalAddress belongs to. unsigned getAddressSpace() const; static bool classof(const SDNode *N) { return N->getOpcode() == ISD::GlobalAddress || N->getOpcode() == ISD::TargetGlobalAddress || N->getOpcode() == ISD::GlobalTLSAddress || N->getOpcode() == ISD::TargetGlobalTLSAddress; } }; class FrameIndexSDNode : public SDNode { int FI; friend class SelectionDAG; FrameIndexSDNode(int fi, EVT VT, bool isTarg) : SDNode(isTarg ? ISD::TargetFrameIndex : ISD::FrameIndex, 0, DebugLoc(), getSDVTList(VT)), FI(fi) { } public: int getIndex() const { return FI; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::FrameIndex || N->getOpcode() == ISD::TargetFrameIndex; } }; class JumpTableSDNode : public SDNode { int JTI; unsigned char TargetFlags; friend class SelectionDAG; JumpTableSDNode(int jti, EVT VT, bool isTarg, unsigned char TF) : SDNode(isTarg ? ISD::TargetJumpTable : ISD::JumpTable, 0, DebugLoc(), getSDVTList(VT)), JTI(jti), TargetFlags(TF) { } public: int getIndex() const { return JTI; } unsigned char getTargetFlags() const { return TargetFlags; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::JumpTable || N->getOpcode() == ISD::TargetJumpTable; } }; class ConstantPoolSDNode : public SDNode { union { const Constant *ConstVal; MachineConstantPoolValue *MachineCPVal; } Val; int Offset; // It's a MachineConstantPoolValue if top bit is set. unsigned Alignment; // Minimum alignment requirement of CP (not log2 value). unsigned char TargetFlags; friend class SelectionDAG; ConstantPoolSDNode(bool isTarget, const Constant *c, EVT VT, int o, unsigned Align, unsigned char TF) : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0, DebugLoc(), getSDVTList(VT)), Offset(o), Alignment(Align), TargetFlags(TF) { assert(Offset >= 0 && "Offset is too large"); Val.ConstVal = c; } ConstantPoolSDNode(bool isTarget, MachineConstantPoolValue *v, EVT VT, int o, unsigned Align, unsigned char TF) : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0, DebugLoc(), getSDVTList(VT)), Offset(o), Alignment(Align), TargetFlags(TF) { assert(Offset >= 0 && "Offset is too large"); Val.MachineCPVal = v; Offset |= 1 << (sizeof(unsigned)*CHAR_BIT-1); } public: bool isMachineConstantPoolEntry() const { return Offset < 0; } const Constant *getConstVal() const { assert(!isMachineConstantPoolEntry() && "Wrong constantpool type"); return Val.ConstVal; } MachineConstantPoolValue *getMachineCPVal() const { assert(isMachineConstantPoolEntry() && "Wrong constantpool type"); return Val.MachineCPVal; } int getOffset() const { return Offset & ~(1 << (sizeof(unsigned)*CHAR_BIT-1)); } // Return the alignment of this constant pool object, which is either 0 (for // default alignment) or the desired value. unsigned getAlignment() const { return Alignment; } unsigned char getTargetFlags() const { return TargetFlags; } Type *getType() const; static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ConstantPool || N->getOpcode() == ISD::TargetConstantPool; } }; /// Completely target-dependent object reference. class TargetIndexSDNode : public SDNode { unsigned char TargetFlags; int Index; int64_t Offset; friend class SelectionDAG; public: TargetIndexSDNode(int Idx, EVT VT, int64_t Ofs, unsigned char TF) : SDNode(ISD::TargetIndex, 0, DebugLoc(), getSDVTList(VT)), TargetFlags(TF), Index(Idx), Offset(Ofs) {} public: unsigned char getTargetFlags() const { return TargetFlags; } int getIndex() const { return Index; } int64_t getOffset() const { return Offset; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::TargetIndex; } }; class BasicBlockSDNode : public SDNode { MachineBasicBlock *MBB; friend class SelectionDAG; /// Debug info is meaningful and potentially useful here, but we create /// blocks out of order when they're jumped to, which makes it a bit /// harder. Let's see if we need it first. explicit BasicBlockSDNode(MachineBasicBlock *mbb) : SDNode(ISD::BasicBlock, 0, DebugLoc(), getSDVTList(MVT::Other)), MBB(mbb) {} public: MachineBasicBlock *getBasicBlock() const { return MBB; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::BasicBlock; } }; /// BuildVectorSDNode - A "pseudo-class" with methods for operating on /// BUILD_VECTORs. class BuildVectorSDNode : public SDNode { // These are constructed as SDNodes and then cast to BuildVectorSDNodes. explicit BuildVectorSDNode() LLVM_DELETED_FUNCTION; public: /// isConstantSplat - Check if this is a constant splat, and if so, find the /// smallest element size that splats the vector. If MinSplatBits is /// nonzero, the element size must be at least that large. Note that the /// splat element may be the entire vector (i.e., a one element vector). /// Returns the splat element value in SplatValue. Any undefined bits in /// that value are zero, and the corresponding bits in the SplatUndef mask /// are set. The SplatBitSize value is set to the splat element size in /// bits. HasAnyUndefs is set to true if any bits in the vector are /// undefined. isBigEndian describes the endianness of the target. bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits = 0, bool isBigEndian = false) const; /// \brief Returns the splatted value or a null value if this is not a splat. /// /// If passed a non-null UndefElements bitvector, it will resize it to match /// the vector width and set the bits where elements are undef. SDValue getSplatValue(BitVector *UndefElements = nullptr) const; /// \brief Returns the splatted constant or null if this is not a constant /// splat. /// /// If passed a non-null UndefElements bitvector, it will resize it to match /// the vector width and set the bits where elements are undef. ConstantSDNode * getConstantSplatNode(BitVector *UndefElements = nullptr) const; /// \brief Returns the splatted constant FP or null if this is not a constant /// FP splat. /// /// If passed a non-null UndefElements bitvector, it will resize it to match /// the vector width and set the bits where elements are undef. ConstantFPSDNode * getConstantFPSplatNode(BitVector *UndefElements = nullptr) const; bool isConstant() const; static inline bool classof(const SDNode *N) { return N->getOpcode() == ISD::BUILD_VECTOR; } }; /// SrcValueSDNode - An SDNode that holds an arbitrary LLVM IR Value. This is /// used when the SelectionDAG needs to make a simple reference to something /// in the LLVM IR representation. /// class SrcValueSDNode : public SDNode { const Value *V; friend class SelectionDAG; /// Create a SrcValue for a general value. explicit SrcValueSDNode(const Value *v) : SDNode(ISD::SRCVALUE, 0, DebugLoc(), getSDVTList(MVT::Other)), V(v) {} public: /// getValue - return the contained Value. const Value *getValue() const { return V; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::SRCVALUE; } }; class MDNodeSDNode : public SDNode { const MDNode *MD; friend class SelectionDAG; explicit MDNodeSDNode(const MDNode *md) : SDNode(ISD::MDNODE_SDNODE, 0, DebugLoc(), getSDVTList(MVT::Other)), MD(md) {} public: const MDNode *getMD() const { return MD; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MDNODE_SDNODE; } }; class RegisterSDNode : public SDNode { unsigned Reg; friend class SelectionDAG; RegisterSDNode(unsigned reg, EVT VT) : SDNode(ISD::Register, 0, DebugLoc(), getSDVTList(VT)), Reg(reg) { } public: unsigned getReg() const { return Reg; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::Register; } }; class RegisterMaskSDNode : public SDNode { // The memory for RegMask is not owned by the node. const uint32_t *RegMask; friend class SelectionDAG; RegisterMaskSDNode(const uint32_t *mask) : SDNode(ISD::RegisterMask, 0, DebugLoc(), getSDVTList(MVT::Untyped)), RegMask(mask) {} public: const uint32_t *getRegMask() const { return RegMask; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::RegisterMask; } }; class BlockAddressSDNode : public SDNode { const BlockAddress *BA; int64_t Offset; unsigned char TargetFlags; friend class SelectionDAG; BlockAddressSDNode(unsigned NodeTy, EVT VT, const BlockAddress *ba, int64_t o, unsigned char Flags) : SDNode(NodeTy, 0, DebugLoc(), getSDVTList(VT)), BA(ba), Offset(o), TargetFlags(Flags) { } public: const BlockAddress *getBlockAddress() const { return BA; } int64_t getOffset() const { return Offset; } unsigned char getTargetFlags() const { return TargetFlags; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::BlockAddress || N->getOpcode() == ISD::TargetBlockAddress; } }; class EHLabelSDNode : public SDNode { SDUse Chain; MCSymbol *Label; friend class SelectionDAG; EHLabelSDNode(unsigned Order, DebugLoc dl, SDValue ch, MCSymbol *L) : SDNode(ISD::EH_LABEL, Order, dl, getSDVTList(MVT::Other)), Label(L) { InitOperands(&Chain, ch); } public: MCSymbol *getLabel() const { return Label; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::EH_LABEL; } }; class ExternalSymbolSDNode : public SDNode { const char *Symbol; unsigned char TargetFlags; friend class SelectionDAG; ExternalSymbolSDNode(bool isTarget, const char *Sym, unsigned char TF, EVT VT) : SDNode(isTarget ? ISD::TargetExternalSymbol : ISD::ExternalSymbol, 0, DebugLoc(), getSDVTList(VT)), Symbol(Sym), TargetFlags(TF) { } public: const char *getSymbol() const { return Symbol; } unsigned char getTargetFlags() const { return TargetFlags; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ExternalSymbol || N->getOpcode() == ISD::TargetExternalSymbol; } }; class CondCodeSDNode : public SDNode { ISD::CondCode Condition; friend class SelectionDAG; explicit CondCodeSDNode(ISD::CondCode Cond) : SDNode(ISD::CONDCODE, 0, DebugLoc(), getSDVTList(MVT::Other)), Condition(Cond) { } public: ISD::CondCode get() const { return Condition; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::CONDCODE; } }; /// CvtRndSatSDNode - NOTE: avoid using this node as this may disappear in the /// future and most targets don't support it. class CvtRndSatSDNode : public SDNode { ISD::CvtCode CvtCode; friend class SelectionDAG; explicit CvtRndSatSDNode(EVT VT, unsigned Order, DebugLoc dl, ArrayRef Ops, ISD::CvtCode Code) : SDNode(ISD::CONVERT_RNDSAT, Order, dl, getSDVTList(VT), Ops), CvtCode(Code) { assert(Ops.size() == 5 && "wrong number of operations"); } public: ISD::CvtCode getCvtCode() const { return CvtCode; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::CONVERT_RNDSAT; } }; /// VTSDNode - This class is used to represent EVT's, which are used /// to parameterize some operations. class VTSDNode : public SDNode { EVT ValueType; friend class SelectionDAG; explicit VTSDNode(EVT VT) : SDNode(ISD::VALUETYPE, 0, DebugLoc(), getSDVTList(MVT::Other)), ValueType(VT) { } public: EVT getVT() const { return ValueType; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::VALUETYPE; } }; /// LSBaseSDNode - Base class for LoadSDNode and StoreSDNode /// class LSBaseSDNode : public MemSDNode { //! Operand array for load and store /*! \note Moving this array to the base class captures more common functionality shared between LoadSDNode and StoreSDNode */ SDUse Ops[4]; public: LSBaseSDNode(ISD::NodeType NodeTy, unsigned Order, DebugLoc dl, SDValue *Operands, unsigned numOperands, SDVTList VTs, ISD::MemIndexedMode AM, EVT MemVT, MachineMemOperand *MMO) : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) { SubclassData |= AM << 2; assert(getAddressingMode() == AM && "MemIndexedMode encoding error!"); InitOperands(Ops, Operands, numOperands); assert((getOffset().getOpcode() == ISD::UNDEF || isIndexed()) && "Only indexed loads and stores have a non-undef offset operand"); } const SDValue &getOffset() const { return getOperand(getOpcode() == ISD::LOAD ? 2 : 3); } /// getAddressingMode - Return the addressing mode for this load or store: /// unindexed, pre-inc, pre-dec, post-inc, or post-dec. ISD::MemIndexedMode getAddressingMode() const { return ISD::MemIndexedMode((SubclassData >> 2) & 7); } /// isIndexed - Return true if this is a pre/post inc/dec load/store. bool isIndexed() const { return getAddressingMode() != ISD::UNINDEXED; } /// isUnindexed - Return true if this is NOT a pre/post inc/dec load/store. bool isUnindexed() const { return getAddressingMode() == ISD::UNINDEXED; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::LOAD || N->getOpcode() == ISD::STORE; } }; /// LoadSDNode - This class is used to represent ISD::LOAD nodes. /// class LoadSDNode : public LSBaseSDNode { friend class SelectionDAG; LoadSDNode(SDValue *ChainPtrOff, unsigned Order, DebugLoc dl, SDVTList VTs, ISD::MemIndexedMode AM, ISD::LoadExtType ETy, EVT MemVT, MachineMemOperand *MMO) : LSBaseSDNode(ISD::LOAD, Order, dl, ChainPtrOff, 3, VTs, AM, MemVT, MMO) { SubclassData |= (unsigned short)ETy; assert(getExtensionType() == ETy && "LoadExtType encoding error!"); assert(readMem() && "Load MachineMemOperand is not a load!"); assert(!writeMem() && "Load MachineMemOperand is a store!"); } public: /// getExtensionType - Return whether this is a plain node, /// or one of the varieties of value-extending loads. ISD::LoadExtType getExtensionType() const { return ISD::LoadExtType(SubclassData & 3); } const SDValue &getBasePtr() const { return getOperand(1); } const SDValue &getOffset() const { return getOperand(2); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::LOAD; } }; /// StoreSDNode - This class is used to represent ISD::STORE nodes. /// class StoreSDNode : public LSBaseSDNode { friend class SelectionDAG; StoreSDNode(SDValue *ChainValuePtrOff, unsigned Order, DebugLoc dl, SDVTList VTs, ISD::MemIndexedMode AM, bool isTrunc, EVT MemVT, MachineMemOperand *MMO) : LSBaseSDNode(ISD::STORE, Order, dl, ChainValuePtrOff, 4, VTs, AM, MemVT, MMO) { SubclassData |= (unsigned short)isTrunc; assert(isTruncatingStore() == isTrunc && "isTrunc encoding error!"); assert(!readMem() && "Store MachineMemOperand is a load!"); assert(writeMem() && "Store MachineMemOperand is not a store!"); } public: /// isTruncatingStore - Return true if the op does a truncation before store. /// For integers this is the same as doing a TRUNCATE and storing the result. /// For floats, it is the same as doing an FP_ROUND and storing the result. bool isTruncatingStore() const { return SubclassData & 1; } const SDValue &getValue() const { return getOperand(1); } const SDValue &getBasePtr() const { return getOperand(2); } const SDValue &getOffset() const { return getOperand(3); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::STORE; } }; /// MaskedLoadStoreSDNode - This is a base class is used to represent MLOAD and /// MSTORE nodes /// class MaskedLoadStoreSDNode : public MemSDNode { // Operands SDUse Ops[4]; public: friend class SelectionDAG; MaskedLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order, DebugLoc dl, SDValue *Operands, unsigned numOperands, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) { InitOperands(Ops, Operands, numOperands); } // In the both nodes address is Op1, mask is Op2: // MaskedLoadSDNode (Chain, ptr, mask, src0), src0 is a passthru value // MaskedStoreSDNode (Chain, ptr, mask, data) // Mask is a vector of i1 elements const SDValue &getBasePtr() const { return getOperand(1); } const SDValue &getMask() const { return getOperand(2); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MLOAD || N->getOpcode() == ISD::MSTORE; } }; /// MaskedLoadSDNode - This class is used to represent an MLOAD node /// class MaskedLoadSDNode : public MaskedLoadStoreSDNode { public: friend class SelectionDAG; - MaskedLoadSDNode(unsigned Order, DebugLoc dl, - SDValue *Operands, unsigned numOperands, - SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) + MaskedLoadSDNode(unsigned Order, DebugLoc dl, SDValue *Operands, + unsigned numOperands, SDVTList VTs, ISD::LoadExtType ETy, + EVT MemVT, MachineMemOperand *MMO) : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, Operands, numOperands, - VTs, MemVT, MMO) - {} + VTs, MemVT, MMO) { + SubclassData |= (unsigned short)ETy; + } + ISD::LoadExtType getExtensionType() const { + return ISD::LoadExtType(SubclassData & 3); + } const SDValue &getSrc0() const { return getOperand(3); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MLOAD; } }; /// MaskedStoreSDNode - This class is used to represent an MSTORE node /// class MaskedStoreSDNode : public MaskedLoadStoreSDNode { public: friend class SelectionDAG; - MaskedStoreSDNode(unsigned Order, DebugLoc dl, - SDValue *Operands, unsigned numOperands, - SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) + MaskedStoreSDNode(unsigned Order, DebugLoc dl, SDValue *Operands, + unsigned numOperands, SDVTList VTs, bool isTrunc, EVT MemVT, + MachineMemOperand *MMO) : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, Operands, numOperands, - VTs, MemVT, MMO) - {} + VTs, MemVT, MMO) { + SubclassData |= (unsigned short)isTrunc; + } + /// isTruncatingStore - Return true if the op does a truncation before store. + /// For integers this is the same as doing a TRUNCATE and storing the result. + /// For floats, it is the same as doing an FP_ROUND and storing the result. + bool isTruncatingStore() const { return SubclassData & 1; } - const SDValue &getData() const { return getOperand(3); } + const SDValue &getValue() const { return getOperand(3); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MSTORE; } }; /// MachineSDNode - An SDNode that represents everything that will be needed /// to construct a MachineInstr. These nodes are created during the /// instruction selection proper phase. /// class MachineSDNode : public SDNode { public: typedef MachineMemOperand **mmo_iterator; private: friend class SelectionDAG; MachineSDNode(unsigned Opc, unsigned Order, const DebugLoc DL, SDVTList VTs) : SDNode(Opc, Order, DL, VTs), MemRefs(nullptr), MemRefsEnd(nullptr) {} /// LocalOperands - Operands for this instruction, if they fit here. If /// they don't, this field is unused. SDUse LocalOperands[4]; /// MemRefs - Memory reference descriptions for this instruction. mmo_iterator MemRefs; mmo_iterator MemRefsEnd; public: mmo_iterator memoperands_begin() const { return MemRefs; } mmo_iterator memoperands_end() const { return MemRefsEnd; } bool memoperands_empty() const { return MemRefsEnd == MemRefs; } /// setMemRefs - Assign this MachineSDNodes's memory reference descriptor /// list. This does not transfer ownership. void setMemRefs(mmo_iterator NewMemRefs, mmo_iterator NewMemRefsEnd) { for (mmo_iterator MMI = NewMemRefs, MME = NewMemRefsEnd; MMI != MME; ++MMI) assert(*MMI && "Null mem ref detected!"); MemRefs = NewMemRefs; MemRefsEnd = NewMemRefsEnd; } static bool classof(const SDNode *N) { return N->isMachineOpcode(); } }; class SDNodeIterator : public std::iterator { const SDNode *Node; unsigned Operand; SDNodeIterator(const SDNode *N, unsigned Op) : Node(N), Operand(Op) {} public: bool operator==(const SDNodeIterator& x) const { return Operand == x.Operand; } bool operator!=(const SDNodeIterator& x) const { return !operator==(x); } const SDNodeIterator &operator=(const SDNodeIterator &I) { assert(I.Node == Node && "Cannot assign iterators to two different nodes!"); Operand = I.Operand; return *this; } pointer operator*() const { return Node->getOperand(Operand).getNode(); } pointer operator->() const { return operator*(); } SDNodeIterator& operator++() { // Preincrement ++Operand; return *this; } SDNodeIterator operator++(int) { // Postincrement SDNodeIterator tmp = *this; ++*this; return tmp; } size_t operator-(SDNodeIterator Other) const { assert(Node == Other.Node && "Cannot compare iterators of two different nodes!"); return Operand - Other.Operand; } static SDNodeIterator begin(const SDNode *N) { return SDNodeIterator(N, 0); } static SDNodeIterator end (const SDNode *N) { return SDNodeIterator(N, N->getNumOperands()); } unsigned getOperand() const { return Operand; } const SDNode *getNode() const { return Node; } }; template <> struct GraphTraits { typedef SDNode NodeType; typedef SDNodeIterator ChildIteratorType; static inline NodeType *getEntryNode(SDNode *N) { return N; } static inline ChildIteratorType child_begin(NodeType *N) { return SDNodeIterator::begin(N); } static inline ChildIteratorType child_end(NodeType *N) { return SDNodeIterator::end(N); } }; /// LargestSDNode - The largest SDNode class. /// typedef AtomicSDNode LargestSDNode; /// MostAlignedSDNode - The SDNode class with the greatest alignment /// requirement. /// typedef GlobalAddressSDNode MostAlignedSDNode; namespace ISD { /// isNormalLoad - Returns true if the specified node is a non-extending /// and unindexed load. inline bool isNormalLoad(const SDNode *N) { const LoadSDNode *Ld = dyn_cast(N); return Ld && Ld->getExtensionType() == ISD::NON_EXTLOAD && Ld->getAddressingMode() == ISD::UNINDEXED; } /// isNON_EXTLoad - Returns true if the specified node is a non-extending /// load. inline bool isNON_EXTLoad(const SDNode *N) { return isa(N) && cast(N)->getExtensionType() == ISD::NON_EXTLOAD; } /// isEXTLoad - Returns true if the specified node is a EXTLOAD. /// inline bool isEXTLoad(const SDNode *N) { return isa(N) && cast(N)->getExtensionType() == ISD::EXTLOAD; } /// isSEXTLoad - Returns true if the specified node is a SEXTLOAD. /// inline bool isSEXTLoad(const SDNode *N) { return isa(N) && cast(N)->getExtensionType() == ISD::SEXTLOAD; } /// isZEXTLoad - Returns true if the specified node is a ZEXTLOAD. /// inline bool isZEXTLoad(const SDNode *N) { return isa(N) && cast(N)->getExtensionType() == ISD::ZEXTLOAD; } /// isUNINDEXEDLoad - Returns true if the specified node is an unindexed load. /// inline bool isUNINDEXEDLoad(const SDNode *N) { return isa(N) && cast(N)->getAddressingMode() == ISD::UNINDEXED; } /// isNormalStore - Returns true if the specified node is a non-truncating /// and unindexed store. inline bool isNormalStore(const SDNode *N) { const StoreSDNode *St = dyn_cast(N); return St && !St->isTruncatingStore() && St->getAddressingMode() == ISD::UNINDEXED; } /// isNON_TRUNCStore - Returns true if the specified node is a non-truncating /// store. inline bool isNON_TRUNCStore(const SDNode *N) { return isa(N) && !cast(N)->isTruncatingStore(); } /// isTRUNCStore - Returns true if the specified node is a truncating /// store. inline bool isTRUNCStore(const SDNode *N) { return isa(N) && cast(N)->isTruncatingStore(); } /// isUNINDEXEDStore - Returns true if the specified node is an /// unindexed store. inline bool isUNINDEXEDStore(const SDNode *N) { return isa(N) && cast(N)->getAddressingMode() == ISD::UNINDEXED; } } } // end llvm namespace #endif Index: projects/clang360-import/contrib/llvm/include/llvm/IR/Constants.h =================================================================== --- projects/clang360-import/contrib/llvm/include/llvm/IR/Constants.h (revision 279024) +++ projects/clang360-import/contrib/llvm/include/llvm/IR/Constants.h (revision 279025) @@ -1,1209 +1,1215 @@ //===-- llvm/Constants.h - Constant class subclass definitions --*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // /// @file /// This file contains the declarations for the subclasses of Constant, /// which represent the different flavors of constant values that live in LLVM. /// Note that Constants are immutable (once created they never change) and are /// fully shared by structural equivalence. This means that two structurally /// equivalent constants will always have the same address. Constant's are /// created on demand as needed and never deleted: thus clients don't have to /// worry about the lifetime of the objects. // //===----------------------------------------------------------------------===// #ifndef LLVM_IR_CONSTANTS_H #define LLVM_IR_CONSTANTS_H #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/OperandTraits.h" namespace llvm { class ArrayType; class IntegerType; class StructType; class PointerType; class VectorType; class SequentialType; struct ConstantExprKeyType; template struct ConstantAggrKeyType; //===----------------------------------------------------------------------===// /// This is the shared class of boolean and integer constants. This class /// represents both boolean and integral constants. /// @brief Class for constant integers. class ConstantInt : public Constant { void anchor() override; void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION; ConstantInt(const ConstantInt &) LLVM_DELETED_FUNCTION; ConstantInt(IntegerType *Ty, const APInt& V); APInt Val; protected: // allocate space for exactly zero operands void *operator new(size_t s) { return User::operator new(s, 0); } public: static ConstantInt *getTrue(LLVMContext &Context); static ConstantInt *getFalse(LLVMContext &Context); static Constant *getTrue(Type *Ty); static Constant *getFalse(Type *Ty); /// If Ty is a vector type, return a Constant with a splat of the given /// value. Otherwise return a ConstantInt for the given value. static Constant *get(Type *Ty, uint64_t V, bool isSigned = false); /// Return a ConstantInt with the specified integer value for the specified /// type. If the type is wider than 64 bits, the value will be zero-extended /// to fit the type, unless isSigned is true, in which case the value will /// be interpreted as a 64-bit signed integer and sign-extended to fit /// the type. /// @brief Get a ConstantInt for a specific value. static ConstantInt *get(IntegerType *Ty, uint64_t V, bool isSigned = false); /// Return a ConstantInt with the specified value for the specified type. The /// value V will be canonicalized to a an unsigned APInt. Accessing it with /// either getSExtValue() or getZExtValue() will yield a correctly sized and /// signed value for the type Ty. /// @brief Get a ConstantInt for a specific signed value. static ConstantInt *getSigned(IntegerType *Ty, int64_t V); static Constant *getSigned(Type *Ty, int64_t V); /// Return a ConstantInt with the specified value and an implied Type. The /// type is the integer type that corresponds to the bit width of the value. static ConstantInt *get(LLVMContext &Context, const APInt &V); /// Return a ConstantInt constructed from the string strStart with the given /// radix. static ConstantInt *get(IntegerType *Ty, StringRef Str, uint8_t radix); /// If Ty is a vector type, return a Constant with a splat of the given /// value. Otherwise return a ConstantInt for the given value. static Constant *get(Type* Ty, const APInt& V); /// Return the constant as an APInt value reference. This allows clients to /// obtain a copy of the value, with all its precision in tact. /// @brief Return the constant's value. inline const APInt &getValue() const { return Val; } /// getBitWidth - Return the bitwidth of this constant. unsigned getBitWidth() const { return Val.getBitWidth(); } /// Return the constant as a 64-bit unsigned integer value after it /// has been zero extended as appropriate for the type of this constant. Note /// that this method can assert if the value does not fit in 64 bits. /// @brief Return the zero extended value. inline uint64_t getZExtValue() const { return Val.getZExtValue(); } /// Return the constant as a 64-bit integer value after it has been sign /// extended as appropriate for the type of this constant. Note that /// this method can assert if the value does not fit in 64 bits. /// @brief Return the sign extended value. inline int64_t getSExtValue() const { return Val.getSExtValue(); } /// A helper method that can be used to determine if the constant contained /// within is equal to a constant. This only works for very small values, /// because this is all that can be represented with all types. /// @brief Determine if this constant's value is same as an unsigned char. bool equalsInt(uint64_t V) const { return Val == V; } /// getType - Specialize the getType() method to always return an IntegerType, /// which reduces the amount of casting needed in parts of the compiler. /// inline IntegerType *getType() const { return cast(Value::getType()); } /// This static method returns true if the type Ty is big enough to /// represent the value V. This can be used to avoid having the get method /// assert when V is larger than Ty can represent. Note that there are two /// versions of this method, one for unsigned and one for signed integers. /// Although ConstantInt canonicalizes everything to an unsigned integer, /// the signed version avoids callers having to convert a signed quantity /// to the appropriate unsigned type before calling the method. /// @returns true if V is a valid value for type Ty /// @brief Determine if the value is in range for the given type. static bool isValueValidForType(Type *Ty, uint64_t V); static bool isValueValidForType(Type *Ty, int64_t V); bool isNegative() const { return Val.isNegative(); } /// This is just a convenience method to make client code smaller for a /// common code. It also correctly performs the comparison without the /// potential for an assertion from getZExtValue(). bool isZero() const { return Val == 0; } /// This is just a convenience method to make client code smaller for a /// common case. It also correctly performs the comparison without the /// potential for an assertion from getZExtValue(). /// @brief Determine if the value is one. bool isOne() const { return Val == 1; } /// This function will return true iff every bit in this constant is set /// to true. /// @returns true iff this constant's bits are all set to true. /// @brief Determine if the value is all ones. bool isMinusOne() const { return Val.isAllOnesValue(); } /// This function will return true iff this constant represents the largest /// value that may be represented by the constant's type. /// @returns true iff this is the largest value that may be represented /// by this type. /// @brief Determine if the value is maximal. bool isMaxValue(bool isSigned) const { if (isSigned) return Val.isMaxSignedValue(); else return Val.isMaxValue(); } /// This function will return true iff this constant represents the smallest /// value that may be represented by this constant's type. /// @returns true if this is the smallest value that may be represented by /// this type. /// @brief Determine if the value is minimal. bool isMinValue(bool isSigned) const { if (isSigned) return Val.isMinSignedValue(); else return Val.isMinValue(); } /// This function will return true iff this constant represents a value with /// active bits bigger than 64 bits or a value greater than the given uint64_t /// value. /// @returns true iff this constant is greater or equal to the given number. /// @brief Determine if the value is greater or equal to the given number. bool uge(uint64_t Num) const { return Val.getActiveBits() > 64 || Val.getZExtValue() >= Num; } /// getLimitedValue - If the value is smaller than the specified limit, /// return it, otherwise return the limit value. This causes the value /// to saturate to the limit. /// @returns the min of the value of the constant and the specified value /// @brief Get the constant's value with a saturation limit uint64_t getLimitedValue(uint64_t Limit = ~0ULL) const { return Val.getLimitedValue(Limit); } /// @brief Methods to support type inquiry through isa, cast, and dyn_cast. static bool classof(const Value *V) { return V->getValueID() == ConstantIntVal; } }; //===----------------------------------------------------------------------===// /// ConstantFP - Floating Point Values [float, double] /// class ConstantFP : public Constant { APFloat Val; void anchor() override; void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION; ConstantFP(const ConstantFP &) LLVM_DELETED_FUNCTION; friend class LLVMContextImpl; protected: ConstantFP(Type *Ty, const APFloat& V); protected: // allocate space for exactly zero operands void *operator new(size_t s) { return User::operator new(s, 0); } public: /// Floating point negation must be implemented with f(x) = -0.0 - x. This /// method returns the negative zero constant for floating point or vector /// floating point types; for all other types, it returns the null value. static Constant *getZeroValueForNegation(Type *Ty); /// get() - This returns a ConstantFP, or a vector containing a splat of a /// ConstantFP, for the specified value in the specified type. This should /// only be used for simple constant values like 2.0/1.0 etc, that are /// known-valid both as host double and as the target format. static Constant *get(Type* Ty, double V); static Constant *get(Type* Ty, StringRef Str); static ConstantFP *get(LLVMContext &Context, const APFloat &V); static Constant *getNegativeZero(Type *Ty); static Constant *getInfinity(Type *Ty, bool Negative = false); /// isValueValidForType - return true if Ty is big enough to represent V. static bool isValueValidForType(Type *Ty, const APFloat &V); inline const APFloat &getValueAPF() const { return Val; } /// isZero - Return true if the value is positive or negative zero. bool isZero() const { return Val.isZero(); } /// isNegative - Return true if the sign bit is set. bool isNegative() const { return Val.isNegative(); } /// isInfinity - Return true if the value is infinity bool isInfinity() const { return Val.isInfinity(); } /// isNaN - Return true if the value is a NaN. bool isNaN() const { return Val.isNaN(); } /// isExactlyValue - We don't rely on operator== working on double values, as /// it returns true for things that are clearly not equal, like -0.0 and 0.0. /// As such, this method can be used to do an exact bit-for-bit comparison of /// two floating point values. The version with a double operand is retained /// because it's so convenient to write isExactlyValue(2.0), but please use /// it only for simple constants. bool isExactlyValue(const APFloat &V) const; bool isExactlyValue(double V) const { bool ignored; APFloat FV(V); FV.convert(Val.getSemantics(), APFloat::rmNearestTiesToEven, &ignored); return isExactlyValue(FV); } /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Value *V) { return V->getValueID() == ConstantFPVal; } }; //===----------------------------------------------------------------------===// /// ConstantAggregateZero - All zero aggregate value /// class ConstantAggregateZero : public Constant { void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION; ConstantAggregateZero(const ConstantAggregateZero &) LLVM_DELETED_FUNCTION; protected: explicit ConstantAggregateZero(Type *ty) : Constant(ty, ConstantAggregateZeroVal, nullptr, 0) {} protected: // allocate space for exactly zero operands void *operator new(size_t s) { return User::operator new(s, 0); } public: static ConstantAggregateZero *get(Type *Ty); void destroyConstant() override; /// getSequentialElement - If this CAZ has array or vector type, return a zero /// with the right element type. Constant *getSequentialElement() const; /// getStructElement - If this CAZ has struct type, return a zero with the /// right element type for the specified element. Constant *getStructElement(unsigned Elt) const; /// getElementValue - Return a zero of the right value for the specified GEP /// index. Constant *getElementValue(Constant *C) const; /// getElementValue - Return a zero of the right value for the specified GEP /// index. Constant *getElementValue(unsigned Idx) const; + /// \brief Return the number of elements in the array, vector, or struct. + unsigned getNumElements() const; + /// Methods for support type inquiry through isa, cast, and dyn_cast: /// static bool classof(const Value *V) { return V->getValueID() == ConstantAggregateZeroVal; } }; //===----------------------------------------------------------------------===// /// ConstantArray - Constant Array Declarations /// class ConstantArray : public Constant { friend struct ConstantAggrKeyType; ConstantArray(const ConstantArray &) LLVM_DELETED_FUNCTION; protected: ConstantArray(ArrayType *T, ArrayRef Val); public: // ConstantArray accessors static Constant *get(ArrayType *T, ArrayRef V); private: static Constant *getImpl(ArrayType *T, ArrayRef V); public: /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant); /// getType - Specialize the getType() method to always return an ArrayType, /// which reduces the amount of casting needed in parts of the compiler. /// inline ArrayType *getType() const { return cast(Value::getType()); } void destroyConstant() override; void replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) override; /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Value *V) { return V->getValueID() == ConstantArrayVal; } }; template <> struct OperandTraits : public VariadicOperandTraits { }; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantArray, Constant) //===----------------------------------------------------------------------===// // ConstantStruct - Constant Struct Declarations // class ConstantStruct : public Constant { friend struct ConstantAggrKeyType; ConstantStruct(const ConstantStruct &) LLVM_DELETED_FUNCTION; protected: ConstantStruct(StructType *T, ArrayRef Val); public: // ConstantStruct accessors static Constant *get(StructType *T, ArrayRef V); static Constant *get(StructType *T, ...) LLVM_END_WITH_NULL; /// getAnon - Return an anonymous struct that has the specified /// elements. If the struct is possibly empty, then you must specify a /// context. static Constant *getAnon(ArrayRef V, bool Packed = false) { return get(getTypeForElements(V, Packed), V); } static Constant *getAnon(LLVMContext &Ctx, ArrayRef V, bool Packed = false) { return get(getTypeForElements(Ctx, V, Packed), V); } /// getTypeForElements - Return an anonymous struct type to use for a constant /// with the specified set of elements. The list must not be empty. static StructType *getTypeForElements(ArrayRef V, bool Packed = false); /// getTypeForElements - This version of the method allows an empty list. static StructType *getTypeForElements(LLVMContext &Ctx, ArrayRef V, bool Packed = false); /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant); /// getType() specialization - Reduce amount of casting... /// inline StructType *getType() const { return cast(Value::getType()); } void destroyConstant() override; void replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) override; /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Value *V) { return V->getValueID() == ConstantStructVal; } }; template <> struct OperandTraits : public VariadicOperandTraits { }; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantStruct, Constant) //===----------------------------------------------------------------------===// /// ConstantVector - Constant Vector Declarations /// class ConstantVector : public Constant { friend struct ConstantAggrKeyType; ConstantVector(const ConstantVector &) LLVM_DELETED_FUNCTION; protected: ConstantVector(VectorType *T, ArrayRef Val); public: // ConstantVector accessors static Constant *get(ArrayRef V); private: static Constant *getImpl(ArrayRef V); public: /// getSplat - Return a ConstantVector with the specified constant in each /// element. static Constant *getSplat(unsigned NumElts, Constant *Elt); /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant); /// getType - Specialize the getType() method to always return a VectorType, /// which reduces the amount of casting needed in parts of the compiler. /// inline VectorType *getType() const { return cast(Value::getType()); } /// getSplatValue - If this is a splat constant, meaning that all of the /// elements have the same value, return that value. Otherwise return NULL. Constant *getSplatValue() const; void destroyConstant() override; void replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) override; /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Value *V) { return V->getValueID() == ConstantVectorVal; } }; template <> struct OperandTraits : public VariadicOperandTraits { }; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantVector, Constant) //===----------------------------------------------------------------------===// /// ConstantPointerNull - a constant pointer value that points to null /// class ConstantPointerNull : public Constant { void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION; ConstantPointerNull(const ConstantPointerNull &) LLVM_DELETED_FUNCTION; protected: explicit ConstantPointerNull(PointerType *T) : Constant(T, Value::ConstantPointerNullVal, nullptr, 0) {} protected: // allocate space for exactly zero operands void *operator new(size_t s) { return User::operator new(s, 0); } public: /// get() - Static factory methods - Return objects of the specified value static ConstantPointerNull *get(PointerType *T); void destroyConstant() override; /// getType - Specialize the getType() method to always return an PointerType, /// which reduces the amount of casting needed in parts of the compiler. /// inline PointerType *getType() const { return cast(Value::getType()); } /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Value *V) { return V->getValueID() == ConstantPointerNullVal; } }; //===----------------------------------------------------------------------===// /// ConstantDataSequential - A vector or array constant whose element type is a /// simple 1/2/4/8-byte integer or float/double, and whose elements are just /// simple data values (i.e. ConstantInt/ConstantFP). This Constant node has no /// operands because it stores all of the elements of the constant as densely /// packed data, instead of as Value*'s. /// /// This is the common base class of ConstantDataArray and ConstantDataVector. /// class ConstantDataSequential : public Constant { friend class LLVMContextImpl; /// DataElements - A pointer to the bytes underlying this constant (which is /// owned by the uniquing StringMap). const char *DataElements; /// Next - This forms a link list of ConstantDataSequential nodes that have /// the same value but different type. For example, 0,0,0,1 could be a 4 /// element array of i8, or a 1-element array of i32. They'll both end up in /// the same StringMap bucket, linked up. ConstantDataSequential *Next; void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION; ConstantDataSequential(const ConstantDataSequential &) LLVM_DELETED_FUNCTION; protected: explicit ConstantDataSequential(Type *ty, ValueTy VT, const char *Data) : Constant(ty, VT, nullptr, 0), DataElements(Data), Next(nullptr) {} ~ConstantDataSequential() { delete Next; } static Constant *getImpl(StringRef Bytes, Type *Ty); protected: // allocate space for exactly zero operands. void *operator new(size_t s) { return User::operator new(s, 0); } public: /// isElementTypeCompatible - Return true if a ConstantDataSequential can be /// formed with a vector or array of the specified element type. /// ConstantDataArray only works with normal float and int types that are /// stored densely in memory, not with things like i42 or x86_f80. static bool isElementTypeCompatible(const Type *Ty); /// getElementAsInteger - If this is a sequential container of integers (of /// any size), return the specified element in the low bits of a uint64_t. uint64_t getElementAsInteger(unsigned i) const; /// getElementAsAPFloat - If this is a sequential container of floating point /// type, return the specified element as an APFloat. APFloat getElementAsAPFloat(unsigned i) const; /// getElementAsFloat - If this is an sequential container of floats, return /// the specified element as a float. float getElementAsFloat(unsigned i) const; /// getElementAsDouble - If this is an sequential container of doubles, return /// the specified element as a double. double getElementAsDouble(unsigned i) const; /// getElementAsConstant - Return a Constant for a specified index's element. /// Note that this has to compute a new constant to return, so it isn't as /// efficient as getElementAsInteger/Float/Double. Constant *getElementAsConstant(unsigned i) const; /// getType - Specialize the getType() method to always return a /// SequentialType, which reduces the amount of casting needed in parts of the /// compiler. inline SequentialType *getType() const { return cast(Value::getType()); } /// getElementType - Return the element type of the array/vector. Type *getElementType() const; /// getNumElements - Return the number of elements in the array or vector. unsigned getNumElements() const; /// getElementByteSize - Return the size (in bytes) of each element in the /// array/vector. The size of the elements is known to be a multiple of one /// byte. uint64_t getElementByteSize() const; /// isString - This method returns true if this is an array of i8. bool isString() const; /// isCString - This method returns true if the array "isString", ends with a /// nul byte, and does not contains any other nul bytes. bool isCString() const; /// getAsString - If this array is isString(), then this method returns the /// array as a StringRef. Otherwise, it asserts out. /// StringRef getAsString() const { assert(isString() && "Not a string"); return getRawDataValues(); } /// getAsCString - If this array is isCString(), then this method returns the /// array (without the trailing null byte) as a StringRef. Otherwise, it /// asserts out. /// StringRef getAsCString() const { assert(isCString() && "Isn't a C string"); StringRef Str = getAsString(); return Str.substr(0, Str.size()-1); } /// getRawDataValues - Return the raw, underlying, bytes of this data. Note /// that this is an extremely tricky thing to work with, as it exposes the /// host endianness of the data elements. StringRef getRawDataValues() const; void destroyConstant() override; /// Methods for support type inquiry through isa, cast, and dyn_cast: /// static bool classof(const Value *V) { return V->getValueID() == ConstantDataArrayVal || V->getValueID() == ConstantDataVectorVal; } private: const char *getElementPointer(unsigned Elt) const; }; //===----------------------------------------------------------------------===// /// ConstantDataArray - An array constant whose element type is a simple /// 1/2/4/8-byte integer or float/double, and whose elements are just simple /// data values (i.e. ConstantInt/ConstantFP). This Constant node has no /// operands because it stores all of the elements of the constant as densely /// packed data, instead of as Value*'s. class ConstantDataArray : public ConstantDataSequential { void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION; ConstantDataArray(const ConstantDataArray &) LLVM_DELETED_FUNCTION; void anchor() override; friend class ConstantDataSequential; explicit ConstantDataArray(Type *ty, const char *Data) : ConstantDataSequential(ty, ConstantDataArrayVal, Data) {} protected: // allocate space for exactly zero operands. void *operator new(size_t s) { return User::operator new(s, 0); } public: /// get() constructors - Return a constant with array type with an element /// count and element type matching the ArrayRef passed in. Note that this /// can return a ConstantAggregateZero object. static Constant *get(LLVMContext &Context, ArrayRef Elts); static Constant *get(LLVMContext &Context, ArrayRef Elts); static Constant *get(LLVMContext &Context, ArrayRef Elts); static Constant *get(LLVMContext &Context, ArrayRef Elts); static Constant *get(LLVMContext &Context, ArrayRef Elts); static Constant *get(LLVMContext &Context, ArrayRef Elts); /// getString - This method constructs a CDS and initializes it with a text /// string. The default behavior (AddNull==true) causes a null terminator to /// be placed at the end of the array (increasing the length of the string by /// one more than the StringRef would normally indicate. Pass AddNull=false /// to disable this behavior. static Constant *getString(LLVMContext &Context, StringRef Initializer, bool AddNull = true); /// getType - Specialize the getType() method to always return an ArrayType, /// which reduces the amount of casting needed in parts of the compiler. /// inline ArrayType *getType() const { return cast(Value::getType()); } /// Methods for support type inquiry through isa, cast, and dyn_cast: /// static bool classof(const Value *V) { return V->getValueID() == ConstantDataArrayVal; } }; //===----------------------------------------------------------------------===// /// ConstantDataVector - A vector constant whose element type is a simple /// 1/2/4/8-byte integer or float/double, and whose elements are just simple /// data values (i.e. ConstantInt/ConstantFP). This Constant node has no /// operands because it stores all of the elements of the constant as densely /// packed data, instead of as Value*'s. class ConstantDataVector : public ConstantDataSequential { void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION; ConstantDataVector(const ConstantDataVector &) LLVM_DELETED_FUNCTION; void anchor() override; friend class ConstantDataSequential; explicit ConstantDataVector(Type *ty, const char *Data) : ConstantDataSequential(ty, ConstantDataVectorVal, Data) {} protected: // allocate space for exactly zero operands. void *operator new(size_t s) { return User::operator new(s, 0); } public: /// get() constructors - Return a constant with vector type with an element /// count and element type matching the ArrayRef passed in. Note that this /// can return a ConstantAggregateZero object. static Constant *get(LLVMContext &Context, ArrayRef Elts); static Constant *get(LLVMContext &Context, ArrayRef Elts); static Constant *get(LLVMContext &Context, ArrayRef Elts); static Constant *get(LLVMContext &Context, ArrayRef Elts); static Constant *get(LLVMContext &Context, ArrayRef Elts); static Constant *get(LLVMContext &Context, ArrayRef Elts); /// getSplat - Return a ConstantVector with the specified constant in each /// element. The specified constant has to be a of a compatible type (i8/i16/ /// i32/i64/float/double) and must be a ConstantFP or ConstantInt. static Constant *getSplat(unsigned NumElts, Constant *Elt); /// getSplatValue - If this is a splat constant, meaning that all of the /// elements have the same value, return that value. Otherwise return NULL. Constant *getSplatValue() const; /// getType - Specialize the getType() method to always return a VectorType, /// which reduces the amount of casting needed in parts of the compiler. /// inline VectorType *getType() const { return cast(Value::getType()); } /// Methods for support type inquiry through isa, cast, and dyn_cast: /// static bool classof(const Value *V) { return V->getValueID() == ConstantDataVectorVal; } }; /// BlockAddress - The address of a basic block. /// class BlockAddress : public Constant { void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION; void *operator new(size_t s) { return User::operator new(s, 2); } BlockAddress(Function *F, BasicBlock *BB); public: /// get - Return a BlockAddress for the specified function and basic block. static BlockAddress *get(Function *F, BasicBlock *BB); /// get - Return a BlockAddress for the specified basic block. The basic /// block must be embedded into a function. static BlockAddress *get(BasicBlock *BB); /// \brief Lookup an existing \c BlockAddress constant for the given /// BasicBlock. /// /// \returns 0 if \c !BB->hasAddressTaken(), otherwise the \c BlockAddress. static BlockAddress *lookup(const BasicBlock *BB); /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); Function *getFunction() const { return (Function*)Op<0>().get(); } BasicBlock *getBasicBlock() const { return (BasicBlock*)Op<1>().get(); } void destroyConstant() override; void replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) override; /// Methods for support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const Value *V) { return V->getValueID() == BlockAddressVal; } }; template <> struct OperandTraits : public FixedNumOperandTraits { }; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BlockAddress, Value) //===----------------------------------------------------------------------===// /// ConstantExpr - a constant value that is initialized with an expression using /// other constant values. /// /// This class uses the standard Instruction opcodes to define the various /// constant expressions. The Opcode field for the ConstantExpr class is /// maintained in the Value::SubclassData field. class ConstantExpr : public Constant { friend struct ConstantExprKeyType; protected: ConstantExpr(Type *ty, unsigned Opcode, Use *Ops, unsigned NumOps) : Constant(ty, ConstantExprVal, Ops, NumOps) { // Operation type (an Instruction opcode) is stored as the SubclassData. setValueSubclassData(Opcode); } public: // Static methods to construct a ConstantExpr of different kinds. Note that // these methods may return a object that is not an instance of the // ConstantExpr class, because they will attempt to fold the constant // expression into something simpler if possible. /// getAlignOf constant expr - computes the alignment of a type in a target /// independent way (Note: the return type is an i64). static Constant *getAlignOf(Type *Ty); /// getSizeOf constant expr - computes the (alloc) size of a type (in /// address-units, not bits) in a target independent way (Note: the return /// type is an i64). /// static Constant *getSizeOf(Type *Ty); /// getOffsetOf constant expr - computes the offset of a struct field in a /// target independent way (Note: the return type is an i64). /// static Constant *getOffsetOf(StructType *STy, unsigned FieldNo); /// getOffsetOf constant expr - This is a generalized form of getOffsetOf, /// which supports any aggregate type, and any Constant index. /// static Constant *getOffsetOf(Type *Ty, Constant *FieldNo); static Constant *getNeg(Constant *C, bool HasNUW = false, bool HasNSW =false); static Constant *getFNeg(Constant *C); static Constant *getNot(Constant *C); static Constant *getAdd(Constant *C1, Constant *C2, bool HasNUW = false, bool HasNSW = false); static Constant *getFAdd(Constant *C1, Constant *C2); static Constant *getSub(Constant *C1, Constant *C2, bool HasNUW = false, bool HasNSW = false); static Constant *getFSub(Constant *C1, Constant *C2); static Constant *getMul(Constant *C1, Constant *C2, bool HasNUW = false, bool HasNSW = false); static Constant *getFMul(Constant *C1, Constant *C2); static Constant *getUDiv(Constant *C1, Constant *C2, bool isExact = false); static Constant *getSDiv(Constant *C1, Constant *C2, bool isExact = false); static Constant *getFDiv(Constant *C1, Constant *C2); static Constant *getURem(Constant *C1, Constant *C2); static Constant *getSRem(Constant *C1, Constant *C2); static Constant *getFRem(Constant *C1, Constant *C2); static Constant *getAnd(Constant *C1, Constant *C2); static Constant *getOr(Constant *C1, Constant *C2); static Constant *getXor(Constant *C1, Constant *C2); static Constant *getShl(Constant *C1, Constant *C2, bool HasNUW = false, bool HasNSW = false); static Constant *getLShr(Constant *C1, Constant *C2, bool isExact = false); static Constant *getAShr(Constant *C1, Constant *C2, bool isExact = false); static Constant *getTrunc(Constant *C, Type *Ty, bool OnlyIfReduced = false); static Constant *getSExt(Constant *C, Type *Ty, bool OnlyIfReduced = false); static Constant *getZExt(Constant *C, Type *Ty, bool OnlyIfReduced = false); static Constant *getFPTrunc(Constant *C, Type *Ty, bool OnlyIfReduced = false); static Constant *getFPExtend(Constant *C, Type *Ty, bool OnlyIfReduced = false); static Constant *getUIToFP(Constant *C, Type *Ty, bool OnlyIfReduced = false); static Constant *getSIToFP(Constant *C, Type *Ty, bool OnlyIfReduced = false); static Constant *getFPToUI(Constant *C, Type *Ty, bool OnlyIfReduced = false); static Constant *getFPToSI(Constant *C, Type *Ty, bool OnlyIfReduced = false); static Constant *getPtrToInt(Constant *C, Type *Ty, bool OnlyIfReduced = false); static Constant *getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced = false); static Constant *getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced = false); static Constant *getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced = false); static Constant *getNSWNeg(Constant *C) { return getNeg(C, false, true); } static Constant *getNUWNeg(Constant *C) { return getNeg(C, true, false); } static Constant *getNSWAdd(Constant *C1, Constant *C2) { return getAdd(C1, C2, false, true); } static Constant *getNUWAdd(Constant *C1, Constant *C2) { return getAdd(C1, C2, true, false); } static Constant *getNSWSub(Constant *C1, Constant *C2) { return getSub(C1, C2, false, true); } static Constant *getNUWSub(Constant *C1, Constant *C2) { return getSub(C1, C2, true, false); } static Constant *getNSWMul(Constant *C1, Constant *C2) { return getMul(C1, C2, false, true); } static Constant *getNUWMul(Constant *C1, Constant *C2) { return getMul(C1, C2, true, false); } static Constant *getNSWShl(Constant *C1, Constant *C2) { return getShl(C1, C2, false, true); } static Constant *getNUWShl(Constant *C1, Constant *C2) { return getShl(C1, C2, true, false); } static Constant *getExactSDiv(Constant *C1, Constant *C2) { return getSDiv(C1, C2, true); } static Constant *getExactUDiv(Constant *C1, Constant *C2) { return getUDiv(C1, C2, true); } static Constant *getExactAShr(Constant *C1, Constant *C2) { return getAShr(C1, C2, true); } static Constant *getExactLShr(Constant *C1, Constant *C2) { return getLShr(C1, C2, true); } /// getBinOpIdentity - Return the identity for the given binary operation, /// i.e. a constant C such that X op C = X and C op X = X for every X. It /// returns null if the operator doesn't have an identity. static Constant *getBinOpIdentity(unsigned Opcode, Type *Ty); /// getBinOpAbsorber - Return the absorbing element for the given binary /// operation, i.e. a constant C such that X op C = C and C op X = C for /// every X. For example, this returns zero for integer multiplication. /// It returns null if the operator doesn't have an absorbing element. static Constant *getBinOpAbsorber(unsigned Opcode, Type *Ty); /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant); /// \brief Convenience function for getting a Cast operation. /// /// \param ops The opcode for the conversion /// \param C The constant to be converted /// \param Ty The type to which the constant is converted /// \param OnlyIfReduced see \a getWithOperands() docs. static Constant *getCast(unsigned ops, Constant *C, Type *Ty, bool OnlyIfReduced = false); // @brief Create a ZExt or BitCast cast constant expression static Constant *getZExtOrBitCast( Constant *C, ///< The constant to zext or bitcast Type *Ty ///< The type to zext or bitcast C to ); // @brief Create a SExt or BitCast cast constant expression static Constant *getSExtOrBitCast( Constant *C, ///< The constant to sext or bitcast Type *Ty ///< The type to sext or bitcast C to ); // @brief Create a Trunc or BitCast cast constant expression static Constant *getTruncOrBitCast( Constant *C, ///< The constant to trunc or bitcast Type *Ty ///< The type to trunc or bitcast C to ); /// @brief Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant /// expression. static Constant *getPointerCast( Constant *C, ///< The pointer value to be casted (operand 0) Type *Ty ///< The type to which cast should be made ); /// @brief Create a BitCast or AddrSpaceCast for a pointer type depending on /// the address space. static Constant *getPointerBitCastOrAddrSpaceCast( Constant *C, ///< The constant to addrspacecast or bitcast Type *Ty ///< The type to bitcast or addrspacecast C to ); /// @brief Create a ZExt, Bitcast or Trunc for integer -> integer casts static Constant *getIntegerCast( Constant *C, ///< The integer constant to be casted Type *Ty, ///< The integer type to cast to bool isSigned ///< Whether C should be treated as signed or not ); /// @brief Create a FPExt, Bitcast or FPTrunc for fp -> fp casts static Constant *getFPCast( Constant *C, ///< The integer constant to be casted Type *Ty ///< The integer type to cast to ); /// @brief Return true if this is a convert constant expression bool isCast() const; /// @brief Return true if this is a compare constant expression bool isCompare() const; /// @brief Return true if this is an insertvalue or extractvalue expression, /// and the getIndices() method may be used. bool hasIndices() const; /// @brief Return true if this is a getelementptr expression and all /// the index operands are compile-time known integers within the /// corresponding notional static array extents. Note that this is /// not equivalant to, a subset of, or a superset of the "inbounds" /// property. bool isGEPWithNoNotionalOverIndexing() const; /// Select constant expr /// /// \param OnlyIfReducedTy see \a getWithOperands() docs. static Constant *getSelect(Constant *C, Constant *V1, Constant *V2, Type *OnlyIfReducedTy = nullptr); /// get - Return a binary or shift operator constant expression, /// folding if possible. /// /// \param OnlyIfReducedTy see \a getWithOperands() docs. static Constant *get(unsigned Opcode, Constant *C1, Constant *C2, unsigned Flags = 0, Type *OnlyIfReducedTy = nullptr); /// \brief Return an ICmp or FCmp comparison operator constant expression. /// /// \param OnlyIfReduced see \a getWithOperands() docs. static Constant *getCompare(unsigned short pred, Constant *C1, Constant *C2, bool OnlyIfReduced = false); /// get* - Return some common constants without having to /// specify the full Instruction::OPCODE identifier. /// static Constant *getICmp(unsigned short pred, Constant *LHS, Constant *RHS, bool OnlyIfReduced = false); static Constant *getFCmp(unsigned short pred, Constant *LHS, Constant *RHS, bool OnlyIfReduced = false); /// Getelementptr form. Value* is only accepted for convenience; /// all elements must be Constant's. /// /// \param OnlyIfReducedTy see \a getWithOperands() docs. static Constant *getGetElementPtr(Constant *C, ArrayRef IdxList, bool InBounds = false, Type *OnlyIfReducedTy = nullptr) { return getGetElementPtr( C, makeArrayRef((Value * const *)IdxList.data(), IdxList.size()), InBounds, OnlyIfReducedTy); } static Constant *getGetElementPtr(Constant *C, Constant *Idx, bool InBounds = false, Type *OnlyIfReducedTy = nullptr) { // This form of the function only exists to avoid ambiguous overload // warnings about whether to convert Idx to ArrayRef or // ArrayRef. return getGetElementPtr(C, cast(Idx), InBounds, OnlyIfReducedTy); } static Constant *getGetElementPtr(Constant *C, ArrayRef IdxList, bool InBounds = false, Type *OnlyIfReducedTy = nullptr); /// Create an "inbounds" getelementptr. See the documentation for the /// "inbounds" flag in LangRef.html for details. static Constant *getInBoundsGetElementPtr(Constant *C, ArrayRef IdxList) { return getGetElementPtr(C, IdxList, true); } static Constant *getInBoundsGetElementPtr(Constant *C, Constant *Idx) { // This form of the function only exists to avoid ambiguous overload // warnings about whether to convert Idx to ArrayRef or // ArrayRef. return getGetElementPtr(C, Idx, true); } static Constant *getInBoundsGetElementPtr(Constant *C, ArrayRef IdxList) { return getGetElementPtr(C, IdxList, true); } static Constant *getExtractElement(Constant *Vec, Constant *Idx, Type *OnlyIfReducedTy = nullptr); static Constant *getInsertElement(Constant *Vec, Constant *Elt, Constant *Idx, Type *OnlyIfReducedTy = nullptr); static Constant *getShuffleVector(Constant *V1, Constant *V2, Constant *Mask, Type *OnlyIfReducedTy = nullptr); static Constant *getExtractValue(Constant *Agg, ArrayRef Idxs, Type *OnlyIfReducedTy = nullptr); static Constant *getInsertValue(Constant *Agg, Constant *Val, ArrayRef Idxs, Type *OnlyIfReducedTy = nullptr); /// getOpcode - Return the opcode at the root of this constant expression unsigned getOpcode() const { return getSubclassDataFromValue(); } /// getPredicate - Return the ICMP or FCMP predicate value. Assert if this is /// not an ICMP or FCMP constant expression. unsigned getPredicate() const; /// getIndices - Assert that this is an insertvalue or exactvalue /// expression and return the list of indices. ArrayRef getIndices() const; /// getOpcodeName - Return a string representation for an opcode. const char *getOpcodeName() const; /// getWithOperandReplaced - Return a constant expression identical to this /// one, but with the specified operand set to the specified value. Constant *getWithOperandReplaced(unsigned OpNo, Constant *Op) const; /// getWithOperands - This returns the current constant expression with the /// operands replaced with the specified values. The specified array must /// have the same number of operands as our current one. Constant *getWithOperands(ArrayRef Ops) const { return getWithOperands(Ops, getType()); } /// \brief Get the current expression with the operands replaced. /// /// Return the current constant expression with the operands replaced with \c /// Ops and the type with \c Ty. The new operands must have the same number /// as the current ones. /// /// If \c OnlyIfReduced is \c true, nullptr will be returned unless something /// gets constant-folded, the type changes, or the expression is otherwise /// canonicalized. This parameter should almost always be \c false. Constant *getWithOperands(ArrayRef Ops, Type *Ty, bool OnlyIfReduced = false) const; /// getAsInstruction - Returns an Instruction which implements the same operation /// as this ConstantExpr. The instruction is not linked to any basic block. /// /// A better approach to this could be to have a constructor for Instruction /// which would take a ConstantExpr parameter, but that would have spread /// implementation details of ConstantExpr outside of Constants.cpp, which /// would make it harder to remove ConstantExprs altogether. Instruction *getAsInstruction(); void destroyConstant() override; void replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) override; /// Methods for support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const Value *V) { return V->getValueID() == ConstantExprVal; } private: // Shadow Value::setValueSubclassData with a private forwarding method so that // subclasses cannot accidentally use it. void setValueSubclassData(unsigned short D) { Value::setValueSubclassData(D); } }; template <> struct OperandTraits : public VariadicOperandTraits { }; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantExpr, Constant) //===----------------------------------------------------------------------===// /// UndefValue - 'undef' values are things that do not have specified contents. /// These are used for a variety of purposes, including global variable /// initializers and operands to instructions. 'undef' values can occur with /// any first-class type. /// /// Undef values aren't exactly constants; if they have multiple uses, they /// can appear to have different bit patterns at each use. See /// LangRef.html#undefvalues for details. /// class UndefValue : public Constant { void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION; UndefValue(const UndefValue &) LLVM_DELETED_FUNCTION; protected: explicit UndefValue(Type *T) : Constant(T, UndefValueVal, nullptr, 0) {} protected: // allocate space for exactly zero operands void *operator new(size_t s) { return User::operator new(s, 0); } public: /// get() - Static factory methods - Return an 'undef' object of the specified /// type. /// static UndefValue *get(Type *T); /// getSequentialElement - If this Undef has array or vector type, return a /// undef with the right element type. UndefValue *getSequentialElement() const; /// getStructElement - If this undef has struct type, return a undef with the /// right element type for the specified element. UndefValue *getStructElement(unsigned Elt) const; /// getElementValue - Return an undef of the right value for the specified GEP /// index. UndefValue *getElementValue(Constant *C) const; /// getElementValue - Return an undef of the right value for the specified GEP /// index. UndefValue *getElementValue(unsigned Idx) const; + + /// \brief Return the number of elements in the array, vector, or struct. + unsigned getNumElements() const; void destroyConstant() override; /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Value *V) { return V->getValueID() == UndefValueVal; } }; } // End llvm namespace #endif Index: projects/clang360-import/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp =================================================================== --- projects/clang360-import/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp (revision 279025) @@ -1,3756 +1,3768 @@ //===- BitcodeReader.cpp - Internal BitcodeReader implementation ----------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// #include "llvm/Bitcode/ReaderWriter.h" #include "BitcodeReader.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Bitcode/LLVMBitCodes.h" #include "llvm/IR/AutoUpgrade.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/Operator.h" #include "llvm/Support/DataStream.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; enum { SWITCH_INST_MAGIC = 0x4B5 // May 2012 => 1205 => Hex }; BitcodeDiagnosticInfo::BitcodeDiagnosticInfo(std::error_code EC, DiagnosticSeverity Severity, const Twine &Msg) : DiagnosticInfo(DK_Bitcode, Severity), Msg(Msg), EC(EC) {} void BitcodeDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; } static std::error_code Error(DiagnosticHandlerFunction DiagnosticHandler, std::error_code EC, const Twine &Message) { BitcodeDiagnosticInfo DI(EC, DS_Error, Message); DiagnosticHandler(DI); return EC; } static std::error_code Error(DiagnosticHandlerFunction DiagnosticHandler, std::error_code EC) { return Error(DiagnosticHandler, EC, EC.message()); } std::error_code BitcodeReader::Error(BitcodeError E, const Twine &Message) { return ::Error(DiagnosticHandler, make_error_code(E), Message); } std::error_code BitcodeReader::Error(const Twine &Message) { return ::Error(DiagnosticHandler, make_error_code(BitcodeError::CorruptedBitcode), Message); } std::error_code BitcodeReader::Error(BitcodeError E) { return ::Error(DiagnosticHandler, make_error_code(E)); } static DiagnosticHandlerFunction getDiagHandler(DiagnosticHandlerFunction F, LLVMContext &C) { if (F) return F; return [&C](const DiagnosticInfo &DI) { C.diagnose(DI); }; } BitcodeReader::BitcodeReader(MemoryBuffer *buffer, LLVMContext &C, DiagnosticHandlerFunction DiagnosticHandler) : Context(C), DiagnosticHandler(getDiagHandler(DiagnosticHandler, C)), TheModule(nullptr), Buffer(buffer), LazyStreamer(nullptr), NextUnreadBit(0), SeenValueSymbolTable(false), ValueList(C), MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false), WillMaterializeAllForwardRefs(false) {} BitcodeReader::BitcodeReader(DataStreamer *streamer, LLVMContext &C, DiagnosticHandlerFunction DiagnosticHandler) : Context(C), DiagnosticHandler(getDiagHandler(DiagnosticHandler, C)), TheModule(nullptr), Buffer(nullptr), LazyStreamer(streamer), NextUnreadBit(0), SeenValueSymbolTable(false), ValueList(C), MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false), WillMaterializeAllForwardRefs(false) {} std::error_code BitcodeReader::materializeForwardReferencedFunctions() { if (WillMaterializeAllForwardRefs) return std::error_code(); // Prevent recursion. WillMaterializeAllForwardRefs = true; while (!BasicBlockFwdRefQueue.empty()) { Function *F = BasicBlockFwdRefQueue.front(); BasicBlockFwdRefQueue.pop_front(); assert(F && "Expected valid function"); if (!BasicBlockFwdRefs.count(F)) // Already materialized. continue; // Check for a function that isn't materializable to prevent an infinite // loop. When parsing a blockaddress stored in a global variable, there // isn't a trivial way to check if a function will have a body without a // linear search through FunctionsWithBodies, so just check it here. if (!F->isMaterializable()) return Error("Never resolved function from blockaddress"); // Try to materialize F. if (std::error_code EC = materialize(F)) return EC; } assert(BasicBlockFwdRefs.empty() && "Function missing from queue"); // Reset state. WillMaterializeAllForwardRefs = false; return std::error_code(); } void BitcodeReader::FreeState() { Buffer = nullptr; std::vector().swap(TypeList); ValueList.clear(); MDValueList.clear(); std::vector().swap(ComdatList); std::vector().swap(MAttributes); std::vector().swap(FunctionBBs); std::vector().swap(FunctionsWithBodies); DeferredFunctionInfo.clear(); MDKindMap.clear(); assert(BasicBlockFwdRefs.empty() && "Unresolved blockaddress fwd references"); BasicBlockFwdRefQueue.clear(); } //===----------------------------------------------------------------------===// // Helper functions to implement forward reference resolution, etc. //===----------------------------------------------------------------------===// /// ConvertToString - Convert a string from a record into an std::string, return /// true on failure. template static bool ConvertToString(ArrayRef Record, unsigned Idx, StrTy &Result) { if (Idx > Record.size()) return true; for (unsigned i = Idx, e = Record.size(); i != e; ++i) Result += (char)Record[i]; return false; } static GlobalValue::LinkageTypes getDecodedLinkage(unsigned Val) { switch (Val) { default: // Map unknown/new linkages to external case 0: return GlobalValue::ExternalLinkage; case 1: return GlobalValue::WeakAnyLinkage; case 2: return GlobalValue::AppendingLinkage; case 3: return GlobalValue::InternalLinkage; case 4: return GlobalValue::LinkOnceAnyLinkage; case 5: return GlobalValue::ExternalLinkage; // Obsolete DLLImportLinkage case 6: return GlobalValue::ExternalLinkage; // Obsolete DLLExportLinkage case 7: return GlobalValue::ExternalWeakLinkage; case 8: return GlobalValue::CommonLinkage; case 9: return GlobalValue::PrivateLinkage; case 10: return GlobalValue::WeakODRLinkage; case 11: return GlobalValue::LinkOnceODRLinkage; case 12: return GlobalValue::AvailableExternallyLinkage; case 13: return GlobalValue::PrivateLinkage; // Obsolete LinkerPrivateLinkage case 14: return GlobalValue::PrivateLinkage; // Obsolete LinkerPrivateWeakLinkage case 15: return GlobalValue::ExternalLinkage; // Obsolete LinkOnceODRAutoHideLinkage } } static GlobalValue::VisibilityTypes GetDecodedVisibility(unsigned Val) { switch (Val) { default: // Map unknown visibilities to default. case 0: return GlobalValue::DefaultVisibility; case 1: return GlobalValue::HiddenVisibility; case 2: return GlobalValue::ProtectedVisibility; } } static GlobalValue::DLLStorageClassTypes GetDecodedDLLStorageClass(unsigned Val) { switch (Val) { default: // Map unknown values to default. case 0: return GlobalValue::DefaultStorageClass; case 1: return GlobalValue::DLLImportStorageClass; case 2: return GlobalValue::DLLExportStorageClass; } } static GlobalVariable::ThreadLocalMode GetDecodedThreadLocalMode(unsigned Val) { switch (Val) { case 0: return GlobalVariable::NotThreadLocal; default: // Map unknown non-zero value to general dynamic. case 1: return GlobalVariable::GeneralDynamicTLSModel; case 2: return GlobalVariable::LocalDynamicTLSModel; case 3: return GlobalVariable::InitialExecTLSModel; case 4: return GlobalVariable::LocalExecTLSModel; } } static int GetDecodedCastOpcode(unsigned Val) { switch (Val) { default: return -1; case bitc::CAST_TRUNC : return Instruction::Trunc; case bitc::CAST_ZEXT : return Instruction::ZExt; case bitc::CAST_SEXT : return Instruction::SExt; case bitc::CAST_FPTOUI : return Instruction::FPToUI; case bitc::CAST_FPTOSI : return Instruction::FPToSI; case bitc::CAST_UITOFP : return Instruction::UIToFP; case bitc::CAST_SITOFP : return Instruction::SIToFP; case bitc::CAST_FPTRUNC : return Instruction::FPTrunc; case bitc::CAST_FPEXT : return Instruction::FPExt; case bitc::CAST_PTRTOINT: return Instruction::PtrToInt; case bitc::CAST_INTTOPTR: return Instruction::IntToPtr; case bitc::CAST_BITCAST : return Instruction::BitCast; case bitc::CAST_ADDRSPACECAST: return Instruction::AddrSpaceCast; } } static int GetDecodedBinaryOpcode(unsigned Val, Type *Ty) { switch (Val) { default: return -1; case bitc::BINOP_ADD: return Ty->isFPOrFPVectorTy() ? Instruction::FAdd : Instruction::Add; case bitc::BINOP_SUB: return Ty->isFPOrFPVectorTy() ? Instruction::FSub : Instruction::Sub; case bitc::BINOP_MUL: return Ty->isFPOrFPVectorTy() ? Instruction::FMul : Instruction::Mul; case bitc::BINOP_UDIV: return Instruction::UDiv; case bitc::BINOP_SDIV: return Ty->isFPOrFPVectorTy() ? Instruction::FDiv : Instruction::SDiv; case bitc::BINOP_UREM: return Instruction::URem; case bitc::BINOP_SREM: return Ty->isFPOrFPVectorTy() ? Instruction::FRem : Instruction::SRem; case bitc::BINOP_SHL: return Instruction::Shl; case bitc::BINOP_LSHR: return Instruction::LShr; case bitc::BINOP_ASHR: return Instruction::AShr; case bitc::BINOP_AND: return Instruction::And; case bitc::BINOP_OR: return Instruction::Or; case bitc::BINOP_XOR: return Instruction::Xor; } } static AtomicRMWInst::BinOp GetDecodedRMWOperation(unsigned Val) { switch (Val) { default: return AtomicRMWInst::BAD_BINOP; case bitc::RMW_XCHG: return AtomicRMWInst::Xchg; case bitc::RMW_ADD: return AtomicRMWInst::Add; case bitc::RMW_SUB: return AtomicRMWInst::Sub; case bitc::RMW_AND: return AtomicRMWInst::And; case bitc::RMW_NAND: return AtomicRMWInst::Nand; case bitc::RMW_OR: return AtomicRMWInst::Or; case bitc::RMW_XOR: return AtomicRMWInst::Xor; case bitc::RMW_MAX: return AtomicRMWInst::Max; case bitc::RMW_MIN: return AtomicRMWInst::Min; case bitc::RMW_UMAX: return AtomicRMWInst::UMax; case bitc::RMW_UMIN: return AtomicRMWInst::UMin; } } static AtomicOrdering GetDecodedOrdering(unsigned Val) { switch (Val) { case bitc::ORDERING_NOTATOMIC: return NotAtomic; case bitc::ORDERING_UNORDERED: return Unordered; case bitc::ORDERING_MONOTONIC: return Monotonic; case bitc::ORDERING_ACQUIRE: return Acquire; case bitc::ORDERING_RELEASE: return Release; case bitc::ORDERING_ACQREL: return AcquireRelease; default: // Map unknown orderings to sequentially-consistent. case bitc::ORDERING_SEQCST: return SequentiallyConsistent; } } static SynchronizationScope GetDecodedSynchScope(unsigned Val) { switch (Val) { case bitc::SYNCHSCOPE_SINGLETHREAD: return SingleThread; default: // Map unknown scopes to cross-thread. case bitc::SYNCHSCOPE_CROSSTHREAD: return CrossThread; } } static Comdat::SelectionKind getDecodedComdatSelectionKind(unsigned Val) { switch (Val) { default: // Map unknown selection kinds to any. case bitc::COMDAT_SELECTION_KIND_ANY: return Comdat::Any; case bitc::COMDAT_SELECTION_KIND_EXACT_MATCH: return Comdat::ExactMatch; case bitc::COMDAT_SELECTION_KIND_LARGEST: return Comdat::Largest; case bitc::COMDAT_SELECTION_KIND_NO_DUPLICATES: return Comdat::NoDuplicates; case bitc::COMDAT_SELECTION_KIND_SAME_SIZE: return Comdat::SameSize; } } static void UpgradeDLLImportExportLinkage(llvm::GlobalValue *GV, unsigned Val) { switch (Val) { case 5: GV->setDLLStorageClass(GlobalValue::DLLImportStorageClass); break; case 6: GV->setDLLStorageClass(GlobalValue::DLLExportStorageClass); break; } } namespace llvm { namespace { /// @brief A class for maintaining the slot number definition /// as a placeholder for the actual definition for forward constants defs. class ConstantPlaceHolder : public ConstantExpr { void operator=(const ConstantPlaceHolder &) LLVM_DELETED_FUNCTION; public: // allocate space for exactly one operand void *operator new(size_t s) { return User::operator new(s, 1); } explicit ConstantPlaceHolder(Type *Ty, LLVMContext& Context) : ConstantExpr(Ty, Instruction::UserOp1, &Op<0>(), 1) { Op<0>() = UndefValue::get(Type::getInt32Ty(Context)); } /// @brief Methods to support type inquiry through isa, cast, and dyn_cast. static bool classof(const Value *V) { return isa(V) && cast(V)->getOpcode() == Instruction::UserOp1; } /// Provide fast operand accessors DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); }; } // FIXME: can we inherit this from ConstantExpr? template <> struct OperandTraits : public FixedNumOperandTraits { }; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantPlaceHolder, Value) } void BitcodeReaderValueList::AssignValue(Value *V, unsigned Idx) { if (Idx == size()) { push_back(V); return; } if (Idx >= size()) resize(Idx+1); WeakVH &OldV = ValuePtrs[Idx]; if (!OldV) { OldV = V; return; } // Handle constants and non-constants (e.g. instrs) differently for // efficiency. if (Constant *PHC = dyn_cast(&*OldV)) { ResolveConstants.push_back(std::make_pair(PHC, Idx)); OldV = V; } else { // If there was a forward reference to this value, replace it. Value *PrevVal = OldV; OldV->replaceAllUsesWith(V); delete PrevVal; } } Constant *BitcodeReaderValueList::getConstantFwdRef(unsigned Idx, Type *Ty) { if (Idx >= size()) resize(Idx + 1); if (Value *V = ValuePtrs[Idx]) { assert(Ty == V->getType() && "Type mismatch in constant table!"); return cast(V); } // Create and return a placeholder, which will later be RAUW'd. Constant *C = new ConstantPlaceHolder(Ty, Context); ValuePtrs[Idx] = C; return C; } Value *BitcodeReaderValueList::getValueFwdRef(unsigned Idx, Type *Ty) { if (Idx >= size()) resize(Idx + 1); if (Value *V = ValuePtrs[Idx]) { assert((!Ty || Ty == V->getType()) && "Type mismatch in value table!"); return V; } // No type specified, must be invalid reference. if (!Ty) return nullptr; // Create and return a placeholder, which will later be RAUW'd. Value *V = new Argument(Ty); ValuePtrs[Idx] = V; return V; } /// ResolveConstantForwardRefs - Once all constants are read, this method bulk /// resolves any forward references. The idea behind this is that we sometimes /// get constants (such as large arrays) which reference *many* forward ref /// constants. Replacing each of these causes a lot of thrashing when /// building/reuniquing the constant. Instead of doing this, we look at all the /// uses and rewrite all the place holders at once for any constant that uses /// a placeholder. void BitcodeReaderValueList::ResolveConstantForwardRefs() { // Sort the values by-pointer so that they are efficient to look up with a // binary search. std::sort(ResolveConstants.begin(), ResolveConstants.end()); SmallVector NewOps; while (!ResolveConstants.empty()) { Value *RealVal = operator[](ResolveConstants.back().second); Constant *Placeholder = ResolveConstants.back().first; ResolveConstants.pop_back(); // Loop over all users of the placeholder, updating them to reference the // new value. If they reference more than one placeholder, update them all // at once. while (!Placeholder->use_empty()) { auto UI = Placeholder->user_begin(); User *U = *UI; // If the using object isn't uniqued, just update the operands. This // handles instructions and initializers for global variables. if (!isa(U) || isa(U)) { UI.getUse().set(RealVal); continue; } // Otherwise, we have a constant that uses the placeholder. Replace that // constant with a new constant that has *all* placeholder uses updated. Constant *UserC = cast(U); for (User::op_iterator I = UserC->op_begin(), E = UserC->op_end(); I != E; ++I) { Value *NewOp; if (!isa(*I)) { // Not a placeholder reference. NewOp = *I; } else if (*I == Placeholder) { // Common case is that it just references this one placeholder. NewOp = RealVal; } else { // Otherwise, look up the placeholder in ResolveConstants. ResolveConstantsTy::iterator It = std::lower_bound(ResolveConstants.begin(), ResolveConstants.end(), std::pair(cast(*I), 0)); assert(It != ResolveConstants.end() && It->first == *I); NewOp = operator[](It->second); } NewOps.push_back(cast(NewOp)); } // Make the new constant. Constant *NewC; if (ConstantArray *UserCA = dyn_cast(UserC)) { NewC = ConstantArray::get(UserCA->getType(), NewOps); } else if (ConstantStruct *UserCS = dyn_cast(UserC)) { NewC = ConstantStruct::get(UserCS->getType(), NewOps); } else if (isa(UserC)) { NewC = ConstantVector::get(NewOps); } else { assert(isa(UserC) && "Must be a ConstantExpr."); NewC = cast(UserC)->getWithOperands(NewOps); } UserC->replaceAllUsesWith(NewC); UserC->destroyConstant(); NewOps.clear(); } // Update all ValueHandles, they should be the only users at this point. Placeholder->replaceAllUsesWith(RealVal); delete Placeholder; } } void BitcodeReaderMDValueList::AssignValue(Metadata *MD, unsigned Idx) { if (Idx == size()) { push_back(MD); return; } if (Idx >= size()) resize(Idx+1); TrackingMDRef &OldMD = MDValuePtrs[Idx]; if (!OldMD) { OldMD.reset(MD); return; } // If there was a forward reference to this value, replace it. MDNodeFwdDecl *PrevMD = cast(OldMD.get()); PrevMD->replaceAllUsesWith(MD); MDNode::deleteTemporary(PrevMD); --NumFwdRefs; } Metadata *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) { if (Idx >= size()) resize(Idx + 1); if (Metadata *MD = MDValuePtrs[Idx]) return MD; - // Create and return a placeholder, which will later be RAUW'd. - AnyFwdRefs = true; + // Track forward refs to be resolved later. + if (AnyFwdRefs) { + MinFwdRef = std::min(MinFwdRef, Idx); + MaxFwdRef = std::max(MaxFwdRef, Idx); + } else { + AnyFwdRefs = true; + MinFwdRef = MaxFwdRef = Idx; + } ++NumFwdRefs; + + // Create and return a placeholder, which will later be RAUW'd. Metadata *MD = MDNode::getTemporary(Context, None); MDValuePtrs[Idx].reset(MD); return MD; } void BitcodeReaderMDValueList::tryToResolveCycles() { if (!AnyFwdRefs) // Nothing to do. return; if (NumFwdRefs) // Still forward references... can't resolve cycles. return; // Resolve any cycles. - for (auto &MD : MDValuePtrs) { + for (unsigned I = MinFwdRef, E = MaxFwdRef + 1; I != E; ++I) { + auto &MD = MDValuePtrs[I]; assert(!(MD && isa(MD)) && "Unexpected forward reference"); if (auto *N = dyn_cast_or_null(MD)) N->resolveCycles(); } + + // Make sure we return early again until there's another forward ref. + AnyFwdRefs = false; } Type *BitcodeReader::getTypeByID(unsigned ID) { // The type table size is always specified correctly. if (ID >= TypeList.size()) return nullptr; if (Type *Ty = TypeList[ID]) return Ty; // If we have a forward reference, the only possible case is when it is to a // named struct. Just create a placeholder for now. return TypeList[ID] = createIdentifiedStructType(Context); } StructType *BitcodeReader::createIdentifiedStructType(LLVMContext &Context, StringRef Name) { auto *Ret = StructType::create(Context, Name); IdentifiedStructTypes.push_back(Ret); return Ret; } StructType *BitcodeReader::createIdentifiedStructType(LLVMContext &Context) { auto *Ret = StructType::create(Context); IdentifiedStructTypes.push_back(Ret); return Ret; } //===----------------------------------------------------------------------===// // Functions for parsing blocks from the bitcode file //===----------------------------------------------------------------------===// /// \brief This fills an AttrBuilder object with the LLVM attributes that have /// been decoded from the given integer. This function must stay in sync with /// 'encodeLLVMAttributesForBitcode'. static void decodeLLVMAttributesForBitcode(AttrBuilder &B, uint64_t EncodedAttrs) { // FIXME: Remove in 4.0. // The alignment is stored as a 16-bit raw value from bits 31--16. We shift // the bits above 31 down by 11 bits. unsigned Alignment = (EncodedAttrs & (0xffffULL << 16)) >> 16; assert((!Alignment || isPowerOf2_32(Alignment)) && "Alignment must be a power of two."); if (Alignment) B.addAlignmentAttr(Alignment); B.addRawValue(((EncodedAttrs & (0xfffffULL << 32)) >> 11) | (EncodedAttrs & 0xffff)); } std::error_code BitcodeReader::ParseAttributeBlock() { if (Stream.EnterSubBlock(bitc::PARAMATTR_BLOCK_ID)) return Error("Invalid record"); if (!MAttributes.empty()) return Error("Invalid multiple blocks"); SmallVector Record; SmallVector Attrs; // Read all the records. while (1) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); switch (Entry.Kind) { case BitstreamEntry::SubBlock: // Handled for us already. case BitstreamEntry::Error: return Error("Malformed block"); case BitstreamEntry::EndBlock: return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; } // Read a record. Record.clear(); switch (Stream.readRecord(Entry.ID, Record)) { default: // Default behavior: ignore. break; case bitc::PARAMATTR_CODE_ENTRY_OLD: { // ENTRY: [paramidx0, attr0, ...] // FIXME: Remove in 4.0. if (Record.size() & 1) return Error("Invalid record"); for (unsigned i = 0, e = Record.size(); i != e; i += 2) { AttrBuilder B; decodeLLVMAttributesForBitcode(B, Record[i+1]); Attrs.push_back(AttributeSet::get(Context, Record[i], B)); } MAttributes.push_back(AttributeSet::get(Context, Attrs)); Attrs.clear(); break; } case bitc::PARAMATTR_CODE_ENTRY: { // ENTRY: [attrgrp0, attrgrp1, ...] for (unsigned i = 0, e = Record.size(); i != e; ++i) Attrs.push_back(MAttributeGroups[Record[i]]); MAttributes.push_back(AttributeSet::get(Context, Attrs)); Attrs.clear(); break; } } } } // Returns Attribute::None on unrecognized codes. static Attribute::AttrKind GetAttrFromCode(uint64_t Code) { switch (Code) { default: return Attribute::None; case bitc::ATTR_KIND_ALIGNMENT: return Attribute::Alignment; case bitc::ATTR_KIND_ALWAYS_INLINE: return Attribute::AlwaysInline; case bitc::ATTR_KIND_BUILTIN: return Attribute::Builtin; case bitc::ATTR_KIND_BY_VAL: return Attribute::ByVal; case bitc::ATTR_KIND_IN_ALLOCA: return Attribute::InAlloca; case bitc::ATTR_KIND_COLD: return Attribute::Cold; case bitc::ATTR_KIND_INLINE_HINT: return Attribute::InlineHint; case bitc::ATTR_KIND_IN_REG: return Attribute::InReg; case bitc::ATTR_KIND_JUMP_TABLE: return Attribute::JumpTable; case bitc::ATTR_KIND_MIN_SIZE: return Attribute::MinSize; case bitc::ATTR_KIND_NAKED: return Attribute::Naked; case bitc::ATTR_KIND_NEST: return Attribute::Nest; case bitc::ATTR_KIND_NO_ALIAS: return Attribute::NoAlias; case bitc::ATTR_KIND_NO_BUILTIN: return Attribute::NoBuiltin; case bitc::ATTR_KIND_NO_CAPTURE: return Attribute::NoCapture; case bitc::ATTR_KIND_NO_DUPLICATE: return Attribute::NoDuplicate; case bitc::ATTR_KIND_NO_IMPLICIT_FLOAT: return Attribute::NoImplicitFloat; case bitc::ATTR_KIND_NO_INLINE: return Attribute::NoInline; case bitc::ATTR_KIND_NON_LAZY_BIND: return Attribute::NonLazyBind; case bitc::ATTR_KIND_NON_NULL: return Attribute::NonNull; case bitc::ATTR_KIND_DEREFERENCEABLE: return Attribute::Dereferenceable; case bitc::ATTR_KIND_NO_RED_ZONE: return Attribute::NoRedZone; case bitc::ATTR_KIND_NO_RETURN: return Attribute::NoReturn; case bitc::ATTR_KIND_NO_UNWIND: return Attribute::NoUnwind; case bitc::ATTR_KIND_OPTIMIZE_FOR_SIZE: return Attribute::OptimizeForSize; case bitc::ATTR_KIND_OPTIMIZE_NONE: return Attribute::OptimizeNone; case bitc::ATTR_KIND_READ_NONE: return Attribute::ReadNone; case bitc::ATTR_KIND_READ_ONLY: return Attribute::ReadOnly; case bitc::ATTR_KIND_RETURNED: return Attribute::Returned; case bitc::ATTR_KIND_RETURNS_TWICE: return Attribute::ReturnsTwice; case bitc::ATTR_KIND_S_EXT: return Attribute::SExt; case bitc::ATTR_KIND_STACK_ALIGNMENT: return Attribute::StackAlignment; case bitc::ATTR_KIND_STACK_PROTECT: return Attribute::StackProtect; case bitc::ATTR_KIND_STACK_PROTECT_REQ: return Attribute::StackProtectReq; case bitc::ATTR_KIND_STACK_PROTECT_STRONG: return Attribute::StackProtectStrong; case bitc::ATTR_KIND_STRUCT_RET: return Attribute::StructRet; case bitc::ATTR_KIND_SANITIZE_ADDRESS: return Attribute::SanitizeAddress; case bitc::ATTR_KIND_SANITIZE_THREAD: return Attribute::SanitizeThread; case bitc::ATTR_KIND_SANITIZE_MEMORY: return Attribute::SanitizeMemory; case bitc::ATTR_KIND_UW_TABLE: return Attribute::UWTable; case bitc::ATTR_KIND_Z_EXT: return Attribute::ZExt; } } std::error_code BitcodeReader::ParseAttrKind(uint64_t Code, Attribute::AttrKind *Kind) { *Kind = GetAttrFromCode(Code); if (*Kind == Attribute::None) return Error(BitcodeError::CorruptedBitcode, "Unknown attribute kind (" + Twine(Code) + ")"); return std::error_code(); } std::error_code BitcodeReader::ParseAttributeGroupBlock() { if (Stream.EnterSubBlock(bitc::PARAMATTR_GROUP_BLOCK_ID)) return Error("Invalid record"); if (!MAttributeGroups.empty()) return Error("Invalid multiple blocks"); SmallVector Record; // Read all the records. while (1) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); switch (Entry.Kind) { case BitstreamEntry::SubBlock: // Handled for us already. case BitstreamEntry::Error: return Error("Malformed block"); case BitstreamEntry::EndBlock: return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; } // Read a record. Record.clear(); switch (Stream.readRecord(Entry.ID, Record)) { default: // Default behavior: ignore. break; case bitc::PARAMATTR_GRP_CODE_ENTRY: { // ENTRY: [grpid, idx, a0, a1, ...] if (Record.size() < 3) return Error("Invalid record"); uint64_t GrpID = Record[0]; uint64_t Idx = Record[1]; // Index of the object this attribute refers to. AttrBuilder B; for (unsigned i = 2, e = Record.size(); i != e; ++i) { if (Record[i] == 0) { // Enum attribute Attribute::AttrKind Kind; if (std::error_code EC = ParseAttrKind(Record[++i], &Kind)) return EC; B.addAttribute(Kind); } else if (Record[i] == 1) { // Integer attribute Attribute::AttrKind Kind; if (std::error_code EC = ParseAttrKind(Record[++i], &Kind)) return EC; if (Kind == Attribute::Alignment) B.addAlignmentAttr(Record[++i]); else if (Kind == Attribute::StackAlignment) B.addStackAlignmentAttr(Record[++i]); else if (Kind == Attribute::Dereferenceable) B.addDereferenceableAttr(Record[++i]); } else { // String attribute assert((Record[i] == 3 || Record[i] == 4) && "Invalid attribute group entry"); bool HasValue = (Record[i++] == 4); SmallString<64> KindStr; SmallString<64> ValStr; while (Record[i] != 0 && i != e) KindStr += Record[i++]; assert(Record[i] == 0 && "Kind string not null terminated"); if (HasValue) { // Has a value associated with it. ++i; // Skip the '0' that terminates the "kind" string. while (Record[i] != 0 && i != e) ValStr += Record[i++]; assert(Record[i] == 0 && "Value string not null terminated"); } B.addAttribute(KindStr.str(), ValStr.str()); } } MAttributeGroups[GrpID] = AttributeSet::get(Context, Idx, B); break; } } } } std::error_code BitcodeReader::ParseTypeTable() { if (Stream.EnterSubBlock(bitc::TYPE_BLOCK_ID_NEW)) return Error("Invalid record"); return ParseTypeTableBody(); } std::error_code BitcodeReader::ParseTypeTableBody() { if (!TypeList.empty()) return Error("Invalid multiple blocks"); SmallVector Record; unsigned NumRecords = 0; SmallString<64> TypeName; // Read all the records for this type table. while (1) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); switch (Entry.Kind) { case BitstreamEntry::SubBlock: // Handled for us already. case BitstreamEntry::Error: return Error("Malformed block"); case BitstreamEntry::EndBlock: if (NumRecords != TypeList.size()) return Error("Malformed block"); return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; } // Read a record. Record.clear(); Type *ResultTy = nullptr; switch (Stream.readRecord(Entry.ID, Record)) { default: return Error("Invalid value"); case bitc::TYPE_CODE_NUMENTRY: // TYPE_CODE_NUMENTRY: [numentries] // TYPE_CODE_NUMENTRY contains a count of the number of types in the // type list. This allows us to reserve space. if (Record.size() < 1) return Error("Invalid record"); TypeList.resize(Record[0]); continue; case bitc::TYPE_CODE_VOID: // VOID ResultTy = Type::getVoidTy(Context); break; case bitc::TYPE_CODE_HALF: // HALF ResultTy = Type::getHalfTy(Context); break; case bitc::TYPE_CODE_FLOAT: // FLOAT ResultTy = Type::getFloatTy(Context); break; case bitc::TYPE_CODE_DOUBLE: // DOUBLE ResultTy = Type::getDoubleTy(Context); break; case bitc::TYPE_CODE_X86_FP80: // X86_FP80 ResultTy = Type::getX86_FP80Ty(Context); break; case bitc::TYPE_CODE_FP128: // FP128 ResultTy = Type::getFP128Ty(Context); break; case bitc::TYPE_CODE_PPC_FP128: // PPC_FP128 ResultTy = Type::getPPC_FP128Ty(Context); break; case bitc::TYPE_CODE_LABEL: // LABEL ResultTy = Type::getLabelTy(Context); break; case bitc::TYPE_CODE_METADATA: // METADATA ResultTy = Type::getMetadataTy(Context); break; case bitc::TYPE_CODE_X86_MMX: // X86_MMX ResultTy = Type::getX86_MMXTy(Context); break; case bitc::TYPE_CODE_INTEGER: // INTEGER: [width] if (Record.size() < 1) return Error("Invalid record"); ResultTy = IntegerType::get(Context, Record[0]); break; case bitc::TYPE_CODE_POINTER: { // POINTER: [pointee type] or // [pointee type, address space] if (Record.size() < 1) return Error("Invalid record"); unsigned AddressSpace = 0; if (Record.size() == 2) AddressSpace = Record[1]; ResultTy = getTypeByID(Record[0]); if (!ResultTy) return Error("Invalid type"); ResultTy = PointerType::get(ResultTy, AddressSpace); break; } case bitc::TYPE_CODE_FUNCTION_OLD: { // FIXME: attrid is dead, remove it in LLVM 4.0 // FUNCTION: [vararg, attrid, retty, paramty x N] if (Record.size() < 3) return Error("Invalid record"); SmallVector ArgTys; for (unsigned i = 3, e = Record.size(); i != e; ++i) { if (Type *T = getTypeByID(Record[i])) ArgTys.push_back(T); else break; } ResultTy = getTypeByID(Record[2]); if (!ResultTy || ArgTys.size() < Record.size()-3) return Error("Invalid type"); ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]); break; } case bitc::TYPE_CODE_FUNCTION: { // FUNCTION: [vararg, retty, paramty x N] if (Record.size() < 2) return Error("Invalid record"); SmallVector ArgTys; for (unsigned i = 2, e = Record.size(); i != e; ++i) { if (Type *T = getTypeByID(Record[i])) ArgTys.push_back(T); else break; } ResultTy = getTypeByID(Record[1]); if (!ResultTy || ArgTys.size() < Record.size()-2) return Error("Invalid type"); ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]); break; } case bitc::TYPE_CODE_STRUCT_ANON: { // STRUCT: [ispacked, eltty x N] if (Record.size() < 1) return Error("Invalid record"); SmallVector EltTys; for (unsigned i = 1, e = Record.size(); i != e; ++i) { if (Type *T = getTypeByID(Record[i])) EltTys.push_back(T); else break; } if (EltTys.size() != Record.size()-1) return Error("Invalid type"); ResultTy = StructType::get(Context, EltTys, Record[0]); break; } case bitc::TYPE_CODE_STRUCT_NAME: // STRUCT_NAME: [strchr x N] if (ConvertToString(Record, 0, TypeName)) return Error("Invalid record"); continue; case bitc::TYPE_CODE_STRUCT_NAMED: { // STRUCT: [ispacked, eltty x N] if (Record.size() < 1) return Error("Invalid record"); if (NumRecords >= TypeList.size()) return Error("Invalid TYPE table"); // Check to see if this was forward referenced, if so fill in the temp. StructType *Res = cast_or_null(TypeList[NumRecords]); if (Res) { Res->setName(TypeName); TypeList[NumRecords] = nullptr; } else // Otherwise, create a new struct. Res = createIdentifiedStructType(Context, TypeName); TypeName.clear(); SmallVector EltTys; for (unsigned i = 1, e = Record.size(); i != e; ++i) { if (Type *T = getTypeByID(Record[i])) EltTys.push_back(T); else break; } if (EltTys.size() != Record.size()-1) return Error("Invalid record"); Res->setBody(EltTys, Record[0]); ResultTy = Res; break; } case bitc::TYPE_CODE_OPAQUE: { // OPAQUE: [] if (Record.size() != 1) return Error("Invalid record"); if (NumRecords >= TypeList.size()) return Error("Invalid TYPE table"); // Check to see if this was forward referenced, if so fill in the temp. StructType *Res = cast_or_null(TypeList[NumRecords]); if (Res) { Res->setName(TypeName); TypeList[NumRecords] = nullptr; } else // Otherwise, create a new struct with no body. Res = createIdentifiedStructType(Context, TypeName); TypeName.clear(); ResultTy = Res; break; } case bitc::TYPE_CODE_ARRAY: // ARRAY: [numelts, eltty] if (Record.size() < 2) return Error("Invalid record"); if ((ResultTy = getTypeByID(Record[1]))) ResultTy = ArrayType::get(ResultTy, Record[0]); else return Error("Invalid type"); break; case bitc::TYPE_CODE_VECTOR: // VECTOR: [numelts, eltty] if (Record.size() < 2) return Error("Invalid record"); if ((ResultTy = getTypeByID(Record[1]))) ResultTy = VectorType::get(ResultTy, Record[0]); else return Error("Invalid type"); break; } if (NumRecords >= TypeList.size()) return Error("Invalid TYPE table"); assert(ResultTy && "Didn't read a type?"); assert(!TypeList[NumRecords] && "Already read type?"); TypeList[NumRecords++] = ResultTy; } } std::error_code BitcodeReader::ParseValueSymbolTable() { if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID)) return Error("Invalid record"); SmallVector Record; // Read all the records for this value table. SmallString<128> ValueName; while (1) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); switch (Entry.Kind) { case BitstreamEntry::SubBlock: // Handled for us already. case BitstreamEntry::Error: return Error("Malformed block"); case BitstreamEntry::EndBlock: return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; } // Read a record. Record.clear(); switch (Stream.readRecord(Entry.ID, Record)) { default: // Default behavior: unknown type. break; case bitc::VST_CODE_ENTRY: { // VST_ENTRY: [valueid, namechar x N] if (ConvertToString(Record, 1, ValueName)) return Error("Invalid record"); unsigned ValueID = Record[0]; if (ValueID >= ValueList.size() || !ValueList[ValueID]) return Error("Invalid record"); Value *V = ValueList[ValueID]; V->setName(StringRef(ValueName.data(), ValueName.size())); ValueName.clear(); break; } case bitc::VST_CODE_BBENTRY: { if (ConvertToString(Record, 1, ValueName)) return Error("Invalid record"); BasicBlock *BB = getBasicBlock(Record[0]); if (!BB) return Error("Invalid record"); BB->setName(StringRef(ValueName.data(), ValueName.size())); ValueName.clear(); break; } } } } std::error_code BitcodeReader::ParseMetadata() { unsigned NextMDValueNo = MDValueList.size(); if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID)) return Error("Invalid record"); SmallVector Record; // Read all the records. while (1) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); switch (Entry.Kind) { case BitstreamEntry::SubBlock: // Handled for us already. case BitstreamEntry::Error: return Error("Malformed block"); case BitstreamEntry::EndBlock: MDValueList.tryToResolveCycles(); return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; } // Read a record. Record.clear(); unsigned Code = Stream.readRecord(Entry.ID, Record); bool IsDistinct = false; switch (Code) { default: // Default behavior: ignore. break; case bitc::METADATA_NAME: { // Read name of the named metadata. SmallString<8> Name(Record.begin(), Record.end()); Record.clear(); Code = Stream.ReadCode(); // METADATA_NAME is always followed by METADATA_NAMED_NODE. unsigned NextBitCode = Stream.readRecord(Code, Record); assert(NextBitCode == bitc::METADATA_NAMED_NODE); (void)NextBitCode; // Read named metadata elements. unsigned Size = Record.size(); NamedMDNode *NMD = TheModule->getOrInsertNamedMetadata(Name); for (unsigned i = 0; i != Size; ++i) { MDNode *MD = dyn_cast_or_null(MDValueList.getValueFwdRef(Record[i])); if (!MD) return Error("Invalid record"); NMD->addOperand(MD); } break; } case bitc::METADATA_OLD_FN_NODE: { // FIXME: Remove in 4.0. // This is a LocalAsMetadata record, the only type of function-local // metadata. if (Record.size() % 2 == 1) return Error("Invalid record"); // If this isn't a LocalAsMetadata record, we're dropping it. This used // to be legal, but there's no upgrade path. auto dropRecord = [&] { MDValueList.AssignValue(MDNode::get(Context, None), NextMDValueNo++); }; if (Record.size() != 2) { dropRecord(); break; } Type *Ty = getTypeByID(Record[0]); if (Ty->isMetadataTy() || Ty->isVoidTy()) { dropRecord(); break; } MDValueList.AssignValue( LocalAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)), NextMDValueNo++); break; } case bitc::METADATA_OLD_NODE: { // FIXME: Remove in 4.0. if (Record.size() % 2 == 1) return Error("Invalid record"); unsigned Size = Record.size(); SmallVector Elts; for (unsigned i = 0; i != Size; i += 2) { Type *Ty = getTypeByID(Record[i]); if (!Ty) return Error("Invalid record"); if (Ty->isMetadataTy()) Elts.push_back(MDValueList.getValueFwdRef(Record[i+1])); else if (!Ty->isVoidTy()) { auto *MD = ValueAsMetadata::get(ValueList.getValueFwdRef(Record[i + 1], Ty)); assert(isa(MD) && "Expected non-function-local metadata"); Elts.push_back(MD); } else Elts.push_back(nullptr); } MDValueList.AssignValue(MDNode::get(Context, Elts), NextMDValueNo++); break; } case bitc::METADATA_VALUE: { if (Record.size() != 2) return Error("Invalid record"); Type *Ty = getTypeByID(Record[0]); if (Ty->isMetadataTy() || Ty->isVoidTy()) return Error("Invalid record"); MDValueList.AssignValue( ValueAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)), NextMDValueNo++); break; } case bitc::METADATA_DISTINCT_NODE: IsDistinct = true; // fallthrough... case bitc::METADATA_NODE: { SmallVector Elts; Elts.reserve(Record.size()); for (unsigned ID : Record) Elts.push_back(ID ? MDValueList.getValueFwdRef(ID - 1) : nullptr); MDValueList.AssignValue(IsDistinct ? MDNode::getDistinct(Context, Elts) : MDNode::get(Context, Elts), NextMDValueNo++); break; } case bitc::METADATA_LOCATION: { if (Record.size() != 5) return Error("Invalid record"); auto get = Record[0] ? MDLocation::getDistinct : MDLocation::get; unsigned Line = Record[1]; unsigned Column = Record[2]; MDNode *Scope = cast(MDValueList.getValueFwdRef(Record[3])); Metadata *InlinedAt = Record[4] ? MDValueList.getValueFwdRef(Record[4] - 1) : nullptr; MDValueList.AssignValue(get(Context, Line, Column, Scope, InlinedAt), NextMDValueNo++); break; } case bitc::METADATA_STRING: { std::string String(Record.begin(), Record.end()); llvm::UpgradeMDStringConstant(String); Metadata *MD = MDString::get(Context, String); MDValueList.AssignValue(MD, NextMDValueNo++); break; } case bitc::METADATA_KIND: { if (Record.size() < 2) return Error("Invalid record"); unsigned Kind = Record[0]; SmallString<8> Name(Record.begin()+1, Record.end()); unsigned NewKind = TheModule->getMDKindID(Name.str()); if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second) return Error("Conflicting METADATA_KIND records"); break; } } } } /// decodeSignRotatedValue - Decode a signed value stored with the sign bit in /// the LSB for dense VBR encoding. uint64_t BitcodeReader::decodeSignRotatedValue(uint64_t V) { if ((V & 1) == 0) return V >> 1; if (V != 1) return -(V >> 1); // There is no such thing as -0 with integers. "-0" really means MININT. return 1ULL << 63; } /// ResolveGlobalAndAliasInits - Resolve all of the initializers for global /// values and aliases that we can. std::error_code BitcodeReader::ResolveGlobalAndAliasInits() { std::vector > GlobalInitWorklist; std::vector > AliasInitWorklist; std::vector > FunctionPrefixWorklist; std::vector > FunctionPrologueWorklist; GlobalInitWorklist.swap(GlobalInits); AliasInitWorklist.swap(AliasInits); FunctionPrefixWorklist.swap(FunctionPrefixes); FunctionPrologueWorklist.swap(FunctionPrologues); while (!GlobalInitWorklist.empty()) { unsigned ValID = GlobalInitWorklist.back().second; if (ValID >= ValueList.size()) { // Not ready to resolve this yet, it requires something later in the file. GlobalInits.push_back(GlobalInitWorklist.back()); } else { if (Constant *C = dyn_cast_or_null(ValueList[ValID])) GlobalInitWorklist.back().first->setInitializer(C); else return Error("Expected a constant"); } GlobalInitWorklist.pop_back(); } while (!AliasInitWorklist.empty()) { unsigned ValID = AliasInitWorklist.back().second; if (ValID >= ValueList.size()) { AliasInits.push_back(AliasInitWorklist.back()); } else { if (Constant *C = dyn_cast_or_null(ValueList[ValID])) AliasInitWorklist.back().first->setAliasee(C); else return Error("Expected a constant"); } AliasInitWorklist.pop_back(); } while (!FunctionPrefixWorklist.empty()) { unsigned ValID = FunctionPrefixWorklist.back().second; if (ValID >= ValueList.size()) { FunctionPrefixes.push_back(FunctionPrefixWorklist.back()); } else { if (Constant *C = dyn_cast_or_null(ValueList[ValID])) FunctionPrefixWorklist.back().first->setPrefixData(C); else return Error("Expected a constant"); } FunctionPrefixWorklist.pop_back(); } while (!FunctionPrologueWorklist.empty()) { unsigned ValID = FunctionPrologueWorklist.back().second; if (ValID >= ValueList.size()) { FunctionPrologues.push_back(FunctionPrologueWorklist.back()); } else { if (Constant *C = dyn_cast_or_null(ValueList[ValID])) FunctionPrologueWorklist.back().first->setPrologueData(C); else return Error("Expected a constant"); } FunctionPrologueWorklist.pop_back(); } return std::error_code(); } static APInt ReadWideAPInt(ArrayRef Vals, unsigned TypeBits) { SmallVector Words(Vals.size()); std::transform(Vals.begin(), Vals.end(), Words.begin(), BitcodeReader::decodeSignRotatedValue); return APInt(TypeBits, Words); } std::error_code BitcodeReader::ParseConstants() { if (Stream.EnterSubBlock(bitc::CONSTANTS_BLOCK_ID)) return Error("Invalid record"); SmallVector Record; // Read all the records for this value table. Type *CurTy = Type::getInt32Ty(Context); unsigned NextCstNo = ValueList.size(); while (1) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); switch (Entry.Kind) { case BitstreamEntry::SubBlock: // Handled for us already. case BitstreamEntry::Error: return Error("Malformed block"); case BitstreamEntry::EndBlock: if (NextCstNo != ValueList.size()) return Error("Invalid ronstant reference"); // Once all the constants have been read, go through and resolve forward // references. ValueList.ResolveConstantForwardRefs(); return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; } // Read a record. Record.clear(); Value *V = nullptr; unsigned BitCode = Stream.readRecord(Entry.ID, Record); switch (BitCode) { default: // Default behavior: unknown constant case bitc::CST_CODE_UNDEF: // UNDEF V = UndefValue::get(CurTy); break; case bitc::CST_CODE_SETTYPE: // SETTYPE: [typeid] if (Record.empty()) return Error("Invalid record"); if (Record[0] >= TypeList.size() || !TypeList[Record[0]]) return Error("Invalid record"); CurTy = TypeList[Record[0]]; continue; // Skip the ValueList manipulation. case bitc::CST_CODE_NULL: // NULL V = Constant::getNullValue(CurTy); break; case bitc::CST_CODE_INTEGER: // INTEGER: [intval] if (!CurTy->isIntegerTy() || Record.empty()) return Error("Invalid record"); V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0])); break; case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval] if (!CurTy->isIntegerTy() || Record.empty()) return Error("Invalid record"); APInt VInt = ReadWideAPInt(Record, cast(CurTy)->getBitWidth()); V = ConstantInt::get(Context, VInt); break; } case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval] if (Record.empty()) return Error("Invalid record"); if (CurTy->isHalfTy()) V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf, APInt(16, (uint16_t)Record[0]))); else if (CurTy->isFloatTy()) V = ConstantFP::get(Context, APFloat(APFloat::IEEEsingle, APInt(32, (uint32_t)Record[0]))); else if (CurTy->isDoubleTy()) V = ConstantFP::get(Context, APFloat(APFloat::IEEEdouble, APInt(64, Record[0]))); else if (CurTy->isX86_FP80Ty()) { // Bits are not stored the same way as a normal i80 APInt, compensate. uint64_t Rearrange[2]; Rearrange[0] = (Record[1] & 0xffffLL) | (Record[0] << 16); Rearrange[1] = Record[0] >> 48; V = ConstantFP::get(Context, APFloat(APFloat::x87DoubleExtended, APInt(80, Rearrange))); } else if (CurTy->isFP128Ty()) V = ConstantFP::get(Context, APFloat(APFloat::IEEEquad, APInt(128, Record))); else if (CurTy->isPPC_FP128Ty()) V = ConstantFP::get(Context, APFloat(APFloat::PPCDoubleDouble, APInt(128, Record))); else V = UndefValue::get(CurTy); break; } case bitc::CST_CODE_AGGREGATE: {// AGGREGATE: [n x value number] if (Record.empty()) return Error("Invalid record"); unsigned Size = Record.size(); SmallVector Elts; if (StructType *STy = dyn_cast(CurTy)) { for (unsigned i = 0; i != Size; ++i) Elts.push_back(ValueList.getConstantFwdRef(Record[i], STy->getElementType(i))); V = ConstantStruct::get(STy, Elts); } else if (ArrayType *ATy = dyn_cast(CurTy)) { Type *EltTy = ATy->getElementType(); for (unsigned i = 0; i != Size; ++i) Elts.push_back(ValueList.getConstantFwdRef(Record[i], EltTy)); V = ConstantArray::get(ATy, Elts); } else if (VectorType *VTy = dyn_cast(CurTy)) { Type *EltTy = VTy->getElementType(); for (unsigned i = 0; i != Size; ++i) Elts.push_back(ValueList.getConstantFwdRef(Record[i], EltTy)); V = ConstantVector::get(Elts); } else { V = UndefValue::get(CurTy); } break; } case bitc::CST_CODE_STRING: // STRING: [values] case bitc::CST_CODE_CSTRING: { // CSTRING: [values] if (Record.empty()) return Error("Invalid record"); SmallString<16> Elts(Record.begin(), Record.end()); V = ConstantDataArray::getString(Context, Elts, BitCode == bitc::CST_CODE_CSTRING); break; } case bitc::CST_CODE_DATA: {// DATA: [n x value] if (Record.empty()) return Error("Invalid record"); Type *EltTy = cast(CurTy)->getElementType(); unsigned Size = Record.size(); if (EltTy->isIntegerTy(8)) { SmallVector Elts(Record.begin(), Record.end()); if (isa(CurTy)) V = ConstantDataVector::get(Context, Elts); else V = ConstantDataArray::get(Context, Elts); } else if (EltTy->isIntegerTy(16)) { SmallVector Elts(Record.begin(), Record.end()); if (isa(CurTy)) V = ConstantDataVector::get(Context, Elts); else V = ConstantDataArray::get(Context, Elts); } else if (EltTy->isIntegerTy(32)) { SmallVector Elts(Record.begin(), Record.end()); if (isa(CurTy)) V = ConstantDataVector::get(Context, Elts); else V = ConstantDataArray::get(Context, Elts); } else if (EltTy->isIntegerTy(64)) { SmallVector Elts(Record.begin(), Record.end()); if (isa(CurTy)) V = ConstantDataVector::get(Context, Elts); else V = ConstantDataArray::get(Context, Elts); } else if (EltTy->isFloatTy()) { SmallVector Elts(Size); std::transform(Record.begin(), Record.end(), Elts.begin(), BitsToFloat); if (isa(CurTy)) V = ConstantDataVector::get(Context, Elts); else V = ConstantDataArray::get(Context, Elts); } else if (EltTy->isDoubleTy()) { SmallVector Elts(Size); std::transform(Record.begin(), Record.end(), Elts.begin(), BitsToDouble); if (isa(CurTy)) V = ConstantDataVector::get(Context, Elts); else V = ConstantDataArray::get(Context, Elts); } else { return Error("Invalid type for value"); } break; } case bitc::CST_CODE_CE_BINOP: { // CE_BINOP: [opcode, opval, opval] if (Record.size() < 3) return Error("Invalid record"); int Opc = GetDecodedBinaryOpcode(Record[0], CurTy); if (Opc < 0) { V = UndefValue::get(CurTy); // Unknown binop. } else { Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy); Constant *RHS = ValueList.getConstantFwdRef(Record[2], CurTy); unsigned Flags = 0; if (Record.size() >= 4) { if (Opc == Instruction::Add || Opc == Instruction::Sub || Opc == Instruction::Mul || Opc == Instruction::Shl) { if (Record[3] & (1 << bitc::OBO_NO_SIGNED_WRAP)) Flags |= OverflowingBinaryOperator::NoSignedWrap; if (Record[3] & (1 << bitc::OBO_NO_UNSIGNED_WRAP)) Flags |= OverflowingBinaryOperator::NoUnsignedWrap; } else if (Opc == Instruction::SDiv || Opc == Instruction::UDiv || Opc == Instruction::LShr || Opc == Instruction::AShr) { if (Record[3] & (1 << bitc::PEO_EXACT)) Flags |= SDivOperator::IsExact; } } V = ConstantExpr::get(Opc, LHS, RHS, Flags); } break; } case bitc::CST_CODE_CE_CAST: { // CE_CAST: [opcode, opty, opval] if (Record.size() < 3) return Error("Invalid record"); int Opc = GetDecodedCastOpcode(Record[0]); if (Opc < 0) { V = UndefValue::get(CurTy); // Unknown cast. } else { Type *OpTy = getTypeByID(Record[1]); if (!OpTy) return Error("Invalid record"); Constant *Op = ValueList.getConstantFwdRef(Record[2], OpTy); V = UpgradeBitCastExpr(Opc, Op, CurTy); if (!V) V = ConstantExpr::getCast(Opc, Op, CurTy); } break; } case bitc::CST_CODE_CE_INBOUNDS_GEP: case bitc::CST_CODE_CE_GEP: { // CE_GEP: [n x operands] if (Record.size() & 1) return Error("Invalid record"); SmallVector Elts; for (unsigned i = 0, e = Record.size(); i != e; i += 2) { Type *ElTy = getTypeByID(Record[i]); if (!ElTy) return Error("Invalid record"); Elts.push_back(ValueList.getConstantFwdRef(Record[i+1], ElTy)); } ArrayRef Indices(Elts.begin() + 1, Elts.end()); V = ConstantExpr::getGetElementPtr(Elts[0], Indices, BitCode == bitc::CST_CODE_CE_INBOUNDS_GEP); break; } case bitc::CST_CODE_CE_SELECT: { // CE_SELECT: [opval#, opval#, opval#] if (Record.size() < 3) return Error("Invalid record"); Type *SelectorTy = Type::getInt1Ty(Context); // If CurTy is a vector of length n, then Record[0] must be a // vector. Otherwise, it must be a single bit. if (VectorType *VTy = dyn_cast(CurTy)) SelectorTy = VectorType::get(Type::getInt1Ty(Context), VTy->getNumElements()); V = ConstantExpr::getSelect(ValueList.getConstantFwdRef(Record[0], SelectorTy), ValueList.getConstantFwdRef(Record[1],CurTy), ValueList.getConstantFwdRef(Record[2],CurTy)); break; } case bitc::CST_CODE_CE_EXTRACTELT : { // CE_EXTRACTELT: [opty, opval, opty, opval] if (Record.size() < 3) return Error("Invalid record"); VectorType *OpTy = dyn_cast_or_null(getTypeByID(Record[0])); if (!OpTy) return Error("Invalid record"); Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy); Constant *Op1 = nullptr; if (Record.size() == 4) { Type *IdxTy = getTypeByID(Record[2]); if (!IdxTy) return Error("Invalid record"); Op1 = ValueList.getConstantFwdRef(Record[3], IdxTy); } else // TODO: Remove with llvm 4.0 Op1 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context)); if (!Op1) return Error("Invalid record"); V = ConstantExpr::getExtractElement(Op0, Op1); break; } case bitc::CST_CODE_CE_INSERTELT : { // CE_INSERTELT: [opval, opval, opty, opval] VectorType *OpTy = dyn_cast(CurTy); if (Record.size() < 3 || !OpTy) return Error("Invalid record"); Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy); Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy->getElementType()); Constant *Op2 = nullptr; if (Record.size() == 4) { Type *IdxTy = getTypeByID(Record[2]); if (!IdxTy) return Error("Invalid record"); Op2 = ValueList.getConstantFwdRef(Record[3], IdxTy); } else // TODO: Remove with llvm 4.0 Op2 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context)); if (!Op2) return Error("Invalid record"); V = ConstantExpr::getInsertElement(Op0, Op1, Op2); break; } case bitc::CST_CODE_CE_SHUFFLEVEC: { // CE_SHUFFLEVEC: [opval, opval, opval] VectorType *OpTy = dyn_cast(CurTy); if (Record.size() < 3 || !OpTy) return Error("Invalid record"); Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy); Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy); Type *ShufTy = VectorType::get(Type::getInt32Ty(Context), OpTy->getNumElements()); Constant *Op2 = ValueList.getConstantFwdRef(Record[2], ShufTy); V = ConstantExpr::getShuffleVector(Op0, Op1, Op2); break; } case bitc::CST_CODE_CE_SHUFVEC_EX: { // [opty, opval, opval, opval] VectorType *RTy = dyn_cast(CurTy); VectorType *OpTy = dyn_cast_or_null(getTypeByID(Record[0])); if (Record.size() < 4 || !RTy || !OpTy) return Error("Invalid record"); Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy); Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy); Type *ShufTy = VectorType::get(Type::getInt32Ty(Context), RTy->getNumElements()); Constant *Op2 = ValueList.getConstantFwdRef(Record[3], ShufTy); V = ConstantExpr::getShuffleVector(Op0, Op1, Op2); break; } case bitc::CST_CODE_CE_CMP: { // CE_CMP: [opty, opval, opval, pred] if (Record.size() < 4) return Error("Invalid record"); Type *OpTy = getTypeByID(Record[0]); if (!OpTy) return Error("Invalid record"); Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy); Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy); if (OpTy->isFPOrFPVectorTy()) V = ConstantExpr::getFCmp(Record[3], Op0, Op1); else V = ConstantExpr::getICmp(Record[3], Op0, Op1); break; } // This maintains backward compatibility, pre-asm dialect keywords. // FIXME: Remove with the 4.0 release. case bitc::CST_CODE_INLINEASM_OLD: { if (Record.size() < 2) return Error("Invalid record"); std::string AsmStr, ConstrStr; bool HasSideEffects = Record[0] & 1; bool IsAlignStack = Record[0] >> 1; unsigned AsmStrSize = Record[1]; if (2+AsmStrSize >= Record.size()) return Error("Invalid record"); unsigned ConstStrSize = Record[2+AsmStrSize]; if (3+AsmStrSize+ConstStrSize > Record.size()) return Error("Invalid record"); for (unsigned i = 0; i != AsmStrSize; ++i) AsmStr += (char)Record[2+i]; for (unsigned i = 0; i != ConstStrSize; ++i) ConstrStr += (char)Record[3+AsmStrSize+i]; PointerType *PTy = cast(CurTy); V = InlineAsm::get(cast(PTy->getElementType()), AsmStr, ConstrStr, HasSideEffects, IsAlignStack); break; } // This version adds support for the asm dialect keywords (e.g., // inteldialect). case bitc::CST_CODE_INLINEASM: { if (Record.size() < 2) return Error("Invalid record"); std::string AsmStr, ConstrStr; bool HasSideEffects = Record[0] & 1; bool IsAlignStack = (Record[0] >> 1) & 1; unsigned AsmDialect = Record[0] >> 2; unsigned AsmStrSize = Record[1]; if (2+AsmStrSize >= Record.size()) return Error("Invalid record"); unsigned ConstStrSize = Record[2+AsmStrSize]; if (3+AsmStrSize+ConstStrSize > Record.size()) return Error("Invalid record"); for (unsigned i = 0; i != AsmStrSize; ++i) AsmStr += (char)Record[2+i]; for (unsigned i = 0; i != ConstStrSize; ++i) ConstrStr += (char)Record[3+AsmStrSize+i]; PointerType *PTy = cast(CurTy); V = InlineAsm::get(cast(PTy->getElementType()), AsmStr, ConstrStr, HasSideEffects, IsAlignStack, InlineAsm::AsmDialect(AsmDialect)); break; } case bitc::CST_CODE_BLOCKADDRESS:{ if (Record.size() < 3) return Error("Invalid record"); Type *FnTy = getTypeByID(Record[0]); if (!FnTy) return Error("Invalid record"); Function *Fn = dyn_cast_or_null(ValueList.getConstantFwdRef(Record[1],FnTy)); if (!Fn) return Error("Invalid record"); // Don't let Fn get dematerialized. BlockAddressesTaken.insert(Fn); // If the function is already parsed we can insert the block address right // away. BasicBlock *BB; unsigned BBID = Record[2]; if (!BBID) // Invalid reference to entry block. return Error("Invalid ID"); if (!Fn->empty()) { Function::iterator BBI = Fn->begin(), BBE = Fn->end(); for (size_t I = 0, E = BBID; I != E; ++I) { if (BBI == BBE) return Error("Invalid ID"); ++BBI; } BB = BBI; } else { // Otherwise insert a placeholder and remember it so it can be inserted // when the function is parsed. auto &FwdBBs = BasicBlockFwdRefs[Fn]; if (FwdBBs.empty()) BasicBlockFwdRefQueue.push_back(Fn); if (FwdBBs.size() < BBID + 1) FwdBBs.resize(BBID + 1); if (!FwdBBs[BBID]) FwdBBs[BBID] = BasicBlock::Create(Context); BB = FwdBBs[BBID]; } V = BlockAddress::get(Fn, BB); break; } } ValueList.AssignValue(V, NextCstNo); ++NextCstNo; } } std::error_code BitcodeReader::ParseUseLists() { if (Stream.EnterSubBlock(bitc::USELIST_BLOCK_ID)) return Error("Invalid record"); // Read all the records. SmallVector Record; while (1) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); switch (Entry.Kind) { case BitstreamEntry::SubBlock: // Handled for us already. case BitstreamEntry::Error: return Error("Malformed block"); case BitstreamEntry::EndBlock: return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; } // Read a use list record. Record.clear(); bool IsBB = false; switch (Stream.readRecord(Entry.ID, Record)) { default: // Default behavior: unknown type. break; case bitc::USELIST_CODE_BB: IsBB = true; // fallthrough case bitc::USELIST_CODE_DEFAULT: { unsigned RecordLength = Record.size(); if (RecordLength < 3) // Records should have at least an ID and two indexes. return Error("Invalid record"); unsigned ID = Record.back(); Record.pop_back(); Value *V; if (IsBB) { assert(ID < FunctionBBs.size() && "Basic block not found"); V = FunctionBBs[ID]; } else V = ValueList[ID]; unsigned NumUses = 0; SmallDenseMap Order; for (const Use &U : V->uses()) { if (++NumUses > Record.size()) break; Order[&U] = Record[NumUses - 1]; } if (Order.size() != Record.size() || NumUses > Record.size()) // Mismatches can happen if the functions are being materialized lazily // (out-of-order), or a value has been upgraded. break; V->sortUseList([&](const Use &L, const Use &R) { return Order.lookup(&L) < Order.lookup(&R); }); break; } } } } /// RememberAndSkipFunctionBody - When we see the block for a function body, /// remember where it is and then skip it. This lets us lazily deserialize the /// functions. std::error_code BitcodeReader::RememberAndSkipFunctionBody() { // Get the function we are talking about. if (FunctionsWithBodies.empty()) return Error("Insufficient function protos"); Function *Fn = FunctionsWithBodies.back(); FunctionsWithBodies.pop_back(); // Save the current stream state. uint64_t CurBit = Stream.GetCurrentBitNo(); DeferredFunctionInfo[Fn] = CurBit; // Skip over the function block for now. if (Stream.SkipBlock()) return Error("Invalid record"); return std::error_code(); } std::error_code BitcodeReader::GlobalCleanup() { // Patch the initializers for globals and aliases up. ResolveGlobalAndAliasInits(); if (!GlobalInits.empty() || !AliasInits.empty()) return Error("Malformed global initializer set"); // Look for intrinsic functions which need to be upgraded at some point for (Module::iterator FI = TheModule->begin(), FE = TheModule->end(); FI != FE; ++FI) { Function *NewFn; if (UpgradeIntrinsicFunction(FI, NewFn)) UpgradedIntrinsics.push_back(std::make_pair(FI, NewFn)); } // Look for global variables which need to be renamed. for (Module::global_iterator GI = TheModule->global_begin(), GE = TheModule->global_end(); GI != GE;) { GlobalVariable *GV = GI++; UpgradeGlobalVariable(GV); } // Force deallocation of memory for these vectors to favor the client that // want lazy deserialization. std::vector >().swap(GlobalInits); std::vector >().swap(AliasInits); return std::error_code(); } std::error_code BitcodeReader::ParseModule(bool Resume) { if (Resume) Stream.JumpToBit(NextUnreadBit); else if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID)) return Error("Invalid record"); SmallVector Record; std::vector SectionTable; std::vector GCTable; // Read all the records for this module. while (1) { BitstreamEntry Entry = Stream.advance(); switch (Entry.Kind) { case BitstreamEntry::Error: return Error("Malformed block"); case BitstreamEntry::EndBlock: return GlobalCleanup(); case BitstreamEntry::SubBlock: switch (Entry.ID) { default: // Skip unknown content. if (Stream.SkipBlock()) return Error("Invalid record"); break; case bitc::BLOCKINFO_BLOCK_ID: if (Stream.ReadBlockInfoBlock()) return Error("Malformed block"); break; case bitc::PARAMATTR_BLOCK_ID: if (std::error_code EC = ParseAttributeBlock()) return EC; break; case bitc::PARAMATTR_GROUP_BLOCK_ID: if (std::error_code EC = ParseAttributeGroupBlock()) return EC; break; case bitc::TYPE_BLOCK_ID_NEW: if (std::error_code EC = ParseTypeTable()) return EC; break; case bitc::VALUE_SYMTAB_BLOCK_ID: if (std::error_code EC = ParseValueSymbolTable()) return EC; SeenValueSymbolTable = true; break; case bitc::CONSTANTS_BLOCK_ID: if (std::error_code EC = ParseConstants()) return EC; if (std::error_code EC = ResolveGlobalAndAliasInits()) return EC; break; case bitc::METADATA_BLOCK_ID: if (std::error_code EC = ParseMetadata()) return EC; break; case bitc::FUNCTION_BLOCK_ID: // If this is the first function body we've seen, reverse the // FunctionsWithBodies list. if (!SeenFirstFunctionBody) { std::reverse(FunctionsWithBodies.begin(), FunctionsWithBodies.end()); if (std::error_code EC = GlobalCleanup()) return EC; SeenFirstFunctionBody = true; } if (std::error_code EC = RememberAndSkipFunctionBody()) return EC; // For streaming bitcode, suspend parsing when we reach the function // bodies. Subsequent materialization calls will resume it when // necessary. For streaming, the function bodies must be at the end of // the bitcode. If the bitcode file is old, the symbol table will be // at the end instead and will not have been seen yet. In this case, // just finish the parse now. if (LazyStreamer && SeenValueSymbolTable) { NextUnreadBit = Stream.GetCurrentBitNo(); return std::error_code(); } break; case bitc::USELIST_BLOCK_ID: if (std::error_code EC = ParseUseLists()) return EC; break; } continue; case BitstreamEntry::Record: // The interesting case. break; } // Read a record. switch (Stream.readRecord(Entry.ID, Record)) { default: break; // Default behavior, ignore unknown content. case bitc::MODULE_CODE_VERSION: { // VERSION: [version#] if (Record.size() < 1) return Error("Invalid record"); // Only version #0 and #1 are supported so far. unsigned module_version = Record[0]; switch (module_version) { default: return Error("Invalid value"); case 0: UseRelativeIDs = false; break; case 1: UseRelativeIDs = true; break; } break; } case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N] std::string S; if (ConvertToString(Record, 0, S)) return Error("Invalid record"); TheModule->setTargetTriple(S); break; } case bitc::MODULE_CODE_DATALAYOUT: { // DATALAYOUT: [strchr x N] std::string S; if (ConvertToString(Record, 0, S)) return Error("Invalid record"); TheModule->setDataLayout(S); break; } case bitc::MODULE_CODE_ASM: { // ASM: [strchr x N] std::string S; if (ConvertToString(Record, 0, S)) return Error("Invalid record"); TheModule->setModuleInlineAsm(S); break; } case bitc::MODULE_CODE_DEPLIB: { // DEPLIB: [strchr x N] // FIXME: Remove in 4.0. std::string S; if (ConvertToString(Record, 0, S)) return Error("Invalid record"); // Ignore value. break; } case bitc::MODULE_CODE_SECTIONNAME: { // SECTIONNAME: [strchr x N] std::string S; if (ConvertToString(Record, 0, S)) return Error("Invalid record"); SectionTable.push_back(S); break; } case bitc::MODULE_CODE_GCNAME: { // SECTIONNAME: [strchr x N] std::string S; if (ConvertToString(Record, 0, S)) return Error("Invalid record"); GCTable.push_back(S); break; } case bitc::MODULE_CODE_COMDAT: { // COMDAT: [selection_kind, name] if (Record.size() < 2) return Error("Invalid record"); Comdat::SelectionKind SK = getDecodedComdatSelectionKind(Record[0]); unsigned ComdatNameSize = Record[1]; std::string ComdatName; ComdatName.reserve(ComdatNameSize); for (unsigned i = 0; i != ComdatNameSize; ++i) ComdatName += (char)Record[2 + i]; Comdat *C = TheModule->getOrInsertComdat(ComdatName); C->setSelectionKind(SK); ComdatList.push_back(C); break; } // GLOBALVAR: [pointer type, isconst, initid, // linkage, alignment, section, visibility, threadlocal, // unnamed_addr, dllstorageclass] case bitc::MODULE_CODE_GLOBALVAR: { if (Record.size() < 6) return Error("Invalid record"); Type *Ty = getTypeByID(Record[0]); if (!Ty) return Error("Invalid record"); if (!Ty->isPointerTy()) return Error("Invalid type for value"); unsigned AddressSpace = cast(Ty)->getAddressSpace(); Ty = cast(Ty)->getElementType(); bool isConstant = Record[1]; GlobalValue::LinkageTypes Linkage = getDecodedLinkage(Record[3]); unsigned Alignment = (1 << Record[4]) >> 1; std::string Section; if (Record[5]) { if (Record[5]-1 >= SectionTable.size()) return Error("Invalid ID"); Section = SectionTable[Record[5]-1]; } GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility; // Local linkage must have default visibility. if (Record.size() > 6 && !GlobalValue::isLocalLinkage(Linkage)) // FIXME: Change to an error if non-default in 4.0. Visibility = GetDecodedVisibility(Record[6]); GlobalVariable::ThreadLocalMode TLM = GlobalVariable::NotThreadLocal; if (Record.size() > 7) TLM = GetDecodedThreadLocalMode(Record[7]); bool UnnamedAddr = false; if (Record.size() > 8) UnnamedAddr = Record[8]; bool ExternallyInitialized = false; if (Record.size() > 9) ExternallyInitialized = Record[9]; GlobalVariable *NewGV = new GlobalVariable(*TheModule, Ty, isConstant, Linkage, nullptr, "", nullptr, TLM, AddressSpace, ExternallyInitialized); NewGV->setAlignment(Alignment); if (!Section.empty()) NewGV->setSection(Section); NewGV->setVisibility(Visibility); NewGV->setUnnamedAddr(UnnamedAddr); if (Record.size() > 10) NewGV->setDLLStorageClass(GetDecodedDLLStorageClass(Record[10])); else UpgradeDLLImportExportLinkage(NewGV, Record[3]); ValueList.push_back(NewGV); // Remember which value to use for the global initializer. if (unsigned InitID = Record[2]) GlobalInits.push_back(std::make_pair(NewGV, InitID-1)); if (Record.size() > 11) if (unsigned ComdatID = Record[11]) { assert(ComdatID <= ComdatList.size()); NewGV->setComdat(ComdatList[ComdatID - 1]); } break; } // FUNCTION: [type, callingconv, isproto, linkage, paramattr, // alignment, section, visibility, gc, unnamed_addr, // prologuedata, dllstorageclass, comdat, prefixdata] case bitc::MODULE_CODE_FUNCTION: { if (Record.size() < 8) return Error("Invalid record"); Type *Ty = getTypeByID(Record[0]); if (!Ty) return Error("Invalid record"); if (!Ty->isPointerTy()) return Error("Invalid type for value"); FunctionType *FTy = dyn_cast(cast(Ty)->getElementType()); if (!FTy) return Error("Invalid type for value"); Function *Func = Function::Create(FTy, GlobalValue::ExternalLinkage, "", TheModule); Func->setCallingConv(static_cast(Record[1])); bool isProto = Record[2]; Func->setLinkage(getDecodedLinkage(Record[3])); Func->setAttributes(getAttributes(Record[4])); Func->setAlignment((1 << Record[5]) >> 1); if (Record[6]) { if (Record[6]-1 >= SectionTable.size()) return Error("Invalid ID"); Func->setSection(SectionTable[Record[6]-1]); } // Local linkage must have default visibility. if (!Func->hasLocalLinkage()) // FIXME: Change to an error if non-default in 4.0. Func->setVisibility(GetDecodedVisibility(Record[7])); if (Record.size() > 8 && Record[8]) { if (Record[8]-1 > GCTable.size()) return Error("Invalid ID"); Func->setGC(GCTable[Record[8]-1].c_str()); } bool UnnamedAddr = false; if (Record.size() > 9) UnnamedAddr = Record[9]; Func->setUnnamedAddr(UnnamedAddr); if (Record.size() > 10 && Record[10] != 0) FunctionPrologues.push_back(std::make_pair(Func, Record[10]-1)); if (Record.size() > 11) Func->setDLLStorageClass(GetDecodedDLLStorageClass(Record[11])); else UpgradeDLLImportExportLinkage(Func, Record[3]); if (Record.size() > 12) if (unsigned ComdatID = Record[12]) { assert(ComdatID <= ComdatList.size()); Func->setComdat(ComdatList[ComdatID - 1]); } if (Record.size() > 13 && Record[13] != 0) FunctionPrefixes.push_back(std::make_pair(Func, Record[13]-1)); ValueList.push_back(Func); // If this is a function with a body, remember the prototype we are // creating now, so that we can match up the body with them later. if (!isProto) { Func->setIsMaterializable(true); FunctionsWithBodies.push_back(Func); if (LazyStreamer) DeferredFunctionInfo[Func] = 0; } break; } // ALIAS: [alias type, aliasee val#, linkage] // ALIAS: [alias type, aliasee val#, linkage, visibility, dllstorageclass] case bitc::MODULE_CODE_ALIAS: { if (Record.size() < 3) return Error("Invalid record"); Type *Ty = getTypeByID(Record[0]); if (!Ty) return Error("Invalid record"); auto *PTy = dyn_cast(Ty); if (!PTy) return Error("Invalid type for value"); auto *NewGA = GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), getDecodedLinkage(Record[2]), "", TheModule); // Old bitcode files didn't have visibility field. // Local linkage must have default visibility. if (Record.size() > 3 && !NewGA->hasLocalLinkage()) // FIXME: Change to an error if non-default in 4.0. NewGA->setVisibility(GetDecodedVisibility(Record[3])); if (Record.size() > 4) NewGA->setDLLStorageClass(GetDecodedDLLStorageClass(Record[4])); else UpgradeDLLImportExportLinkage(NewGA, Record[2]); if (Record.size() > 5) NewGA->setThreadLocalMode(GetDecodedThreadLocalMode(Record[5])); if (Record.size() > 6) NewGA->setUnnamedAddr(Record[6]); ValueList.push_back(NewGA); AliasInits.push_back(std::make_pair(NewGA, Record[1])); break; } /// MODULE_CODE_PURGEVALS: [numvals] case bitc::MODULE_CODE_PURGEVALS: // Trim down the value list to the specified size. if (Record.size() < 1 || Record[0] > ValueList.size()) return Error("Invalid record"); ValueList.shrinkTo(Record[0]); break; } Record.clear(); } } std::error_code BitcodeReader::ParseBitcodeInto(Module *M) { TheModule = nullptr; if (std::error_code EC = InitStream()) return EC; // Sniff for the signature. if (Stream.Read(8) != 'B' || Stream.Read(8) != 'C' || Stream.Read(4) != 0x0 || Stream.Read(4) != 0xC || Stream.Read(4) != 0xE || Stream.Read(4) != 0xD) return Error("Invalid bitcode signature"); // We expect a number of well-defined blocks, though we don't necessarily // need to understand them all. while (1) { if (Stream.AtEndOfStream()) return std::error_code(); BitstreamEntry Entry = Stream.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs); switch (Entry.Kind) { case BitstreamEntry::Error: return Error("Malformed block"); case BitstreamEntry::EndBlock: return std::error_code(); case BitstreamEntry::SubBlock: switch (Entry.ID) { case bitc::BLOCKINFO_BLOCK_ID: if (Stream.ReadBlockInfoBlock()) return Error("Malformed block"); break; case bitc::MODULE_BLOCK_ID: // Reject multiple MODULE_BLOCK's in a single bitstream. if (TheModule) return Error("Invalid multiple blocks"); TheModule = M; if (std::error_code EC = ParseModule(false)) return EC; if (LazyStreamer) return std::error_code(); break; default: if (Stream.SkipBlock()) return Error("Invalid record"); break; } continue; case BitstreamEntry::Record: // There should be no records in the top-level of blocks. // The ranlib in Xcode 4 will align archive members by appending newlines // to the end of them. If this file size is a multiple of 4 but not 8, we // have to read and ignore these final 4 bytes :-( if (Stream.getAbbrevIDWidth() == 2 && Entry.ID == 2 && Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a && Stream.AtEndOfStream()) return std::error_code(); return Error("Invalid record"); } } } ErrorOr BitcodeReader::parseModuleTriple() { if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID)) return Error("Invalid record"); SmallVector Record; std::string Triple; // Read all the records for this module. while (1) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); switch (Entry.Kind) { case BitstreamEntry::SubBlock: // Handled for us already. case BitstreamEntry::Error: return Error("Malformed block"); case BitstreamEntry::EndBlock: return Triple; case BitstreamEntry::Record: // The interesting case. break; } // Read a record. switch (Stream.readRecord(Entry.ID, Record)) { default: break; // Default behavior, ignore unknown content. case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N] std::string S; if (ConvertToString(Record, 0, S)) return Error("Invalid record"); Triple = S; break; } } Record.clear(); } llvm_unreachable("Exit infinite loop"); } ErrorOr BitcodeReader::parseTriple() { if (std::error_code EC = InitStream()) return EC; // Sniff for the signature. if (Stream.Read(8) != 'B' || Stream.Read(8) != 'C' || Stream.Read(4) != 0x0 || Stream.Read(4) != 0xC || Stream.Read(4) != 0xE || Stream.Read(4) != 0xD) return Error("Invalid bitcode signature"); // We expect a number of well-defined blocks, though we don't necessarily // need to understand them all. while (1) { BitstreamEntry Entry = Stream.advance(); switch (Entry.Kind) { case BitstreamEntry::Error: return Error("Malformed block"); case BitstreamEntry::EndBlock: return std::error_code(); case BitstreamEntry::SubBlock: if (Entry.ID == bitc::MODULE_BLOCK_ID) return parseModuleTriple(); // Ignore other sub-blocks. if (Stream.SkipBlock()) return Error("Malformed block"); continue; case BitstreamEntry::Record: Stream.skipRecord(Entry.ID); continue; } } } /// ParseMetadataAttachment - Parse metadata attachments. std::error_code BitcodeReader::ParseMetadataAttachment() { if (Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID)) return Error("Invalid record"); SmallVector Record; while (1) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); switch (Entry.Kind) { case BitstreamEntry::SubBlock: // Handled for us already. case BitstreamEntry::Error: return Error("Malformed block"); case BitstreamEntry::EndBlock: return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; } // Read a metadata attachment record. Record.clear(); switch (Stream.readRecord(Entry.ID, Record)) { default: // Default behavior: ignore. break; case bitc::METADATA_ATTACHMENT: { unsigned RecordLength = Record.size(); if (Record.empty() || (RecordLength - 1) % 2 == 1) return Error("Invalid record"); Instruction *Inst = InstructionList[Record[0]]; for (unsigned i = 1; i != RecordLength; i = i+2) { unsigned Kind = Record[i]; DenseMap::iterator I = MDKindMap.find(Kind); if (I == MDKindMap.end()) return Error("Invalid ID"); Metadata *Node = MDValueList.getValueFwdRef(Record[i + 1]); if (isa(Node)) // Drop the attachment. This used to be legal, but there's no // upgrade path. break; Inst->setMetadata(I->second, cast(Node)); if (I->second == LLVMContext::MD_tbaa) InstsWithTBAATag.push_back(Inst); } break; } } } } /// ParseFunctionBody - Lazily parse the specified function body block. std::error_code BitcodeReader::ParseFunctionBody(Function *F) { if (Stream.EnterSubBlock(bitc::FUNCTION_BLOCK_ID)) return Error("Invalid record"); InstructionList.clear(); unsigned ModuleValueListSize = ValueList.size(); unsigned ModuleMDValueListSize = MDValueList.size(); // Add all the function arguments to the value table. for(Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) ValueList.push_back(I); unsigned NextValueNo = ValueList.size(); BasicBlock *CurBB = nullptr; unsigned CurBBNo = 0; DebugLoc LastLoc; auto getLastInstruction = [&]() -> Instruction * { if (CurBB && !CurBB->empty()) return &CurBB->back(); else if (CurBBNo && FunctionBBs[CurBBNo - 1] && !FunctionBBs[CurBBNo - 1]->empty()) return &FunctionBBs[CurBBNo - 1]->back(); return nullptr; }; // Read all the records. SmallVector Record; while (1) { BitstreamEntry Entry = Stream.advance(); switch (Entry.Kind) { case BitstreamEntry::Error: return Error("Malformed block"); case BitstreamEntry::EndBlock: goto OutOfRecordLoop; case BitstreamEntry::SubBlock: switch (Entry.ID) { default: // Skip unknown content. if (Stream.SkipBlock()) return Error("Invalid record"); break; case bitc::CONSTANTS_BLOCK_ID: if (std::error_code EC = ParseConstants()) return EC; NextValueNo = ValueList.size(); break; case bitc::VALUE_SYMTAB_BLOCK_ID: if (std::error_code EC = ParseValueSymbolTable()) return EC; break; case bitc::METADATA_ATTACHMENT_ID: if (std::error_code EC = ParseMetadataAttachment()) return EC; break; case bitc::METADATA_BLOCK_ID: if (std::error_code EC = ParseMetadata()) return EC; break; case bitc::USELIST_BLOCK_ID: if (std::error_code EC = ParseUseLists()) return EC; break; } continue; case BitstreamEntry::Record: // The interesting case. break; } // Read a record. Record.clear(); Instruction *I = nullptr; unsigned BitCode = Stream.readRecord(Entry.ID, Record); switch (BitCode) { default: // Default behavior: reject return Error("Invalid value"); case bitc::FUNC_CODE_DECLAREBLOCKS: { // DECLAREBLOCKS: [nblocks] if (Record.size() < 1 || Record[0] == 0) return Error("Invalid record"); // Create all the basic blocks for the function. FunctionBBs.resize(Record[0]); // See if anything took the address of blocks in this function. auto BBFRI = BasicBlockFwdRefs.find(F); if (BBFRI == BasicBlockFwdRefs.end()) { for (unsigned i = 0, e = FunctionBBs.size(); i != e; ++i) FunctionBBs[i] = BasicBlock::Create(Context, "", F); } else { auto &BBRefs = BBFRI->second; // Check for invalid basic block references. if (BBRefs.size() > FunctionBBs.size()) return Error("Invalid ID"); assert(!BBRefs.empty() && "Unexpected empty array"); assert(!BBRefs.front() && "Invalid reference to entry block"); for (unsigned I = 0, E = FunctionBBs.size(), RE = BBRefs.size(); I != E; ++I) if (I < RE && BBRefs[I]) { BBRefs[I]->insertInto(F); FunctionBBs[I] = BBRefs[I]; } else { FunctionBBs[I] = BasicBlock::Create(Context, "", F); } // Erase from the table. BasicBlockFwdRefs.erase(BBFRI); } CurBB = FunctionBBs[0]; continue; } case bitc::FUNC_CODE_DEBUG_LOC_AGAIN: // DEBUG_LOC_AGAIN // This record indicates that the last instruction is at the same // location as the previous instruction with a location. I = getLastInstruction(); if (!I) return Error("Invalid record"); I->setDebugLoc(LastLoc); I = nullptr; continue; case bitc::FUNC_CODE_DEBUG_LOC: { // DEBUG_LOC: [line, col, scope, ia] I = getLastInstruction(); if (!I || Record.size() < 4) return Error("Invalid record"); unsigned Line = Record[0], Col = Record[1]; unsigned ScopeID = Record[2], IAID = Record[3]; MDNode *Scope = nullptr, *IA = nullptr; if (ScopeID) Scope = cast(MDValueList.getValueFwdRef(ScopeID-1)); if (IAID) IA = cast(MDValueList.getValueFwdRef(IAID-1)); LastLoc = DebugLoc::get(Line, Col, Scope, IA); I->setDebugLoc(LastLoc); I = nullptr; continue; } case bitc::FUNC_CODE_INST_BINOP: { // BINOP: [opval, ty, opval, opcode] unsigned OpNum = 0; Value *LHS, *RHS; if (getValueTypePair(Record, OpNum, NextValueNo, LHS) || popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) || OpNum+1 > Record.size()) return Error("Invalid record"); int Opc = GetDecodedBinaryOpcode(Record[OpNum++], LHS->getType()); if (Opc == -1) return Error("Invalid record"); I = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS); InstructionList.push_back(I); if (OpNum < Record.size()) { if (Opc == Instruction::Add || Opc == Instruction::Sub || Opc == Instruction::Mul || Opc == Instruction::Shl) { if (Record[OpNum] & (1 << bitc::OBO_NO_SIGNED_WRAP)) cast(I)->setHasNoSignedWrap(true); if (Record[OpNum] & (1 << bitc::OBO_NO_UNSIGNED_WRAP)) cast(I)->setHasNoUnsignedWrap(true); } else if (Opc == Instruction::SDiv || Opc == Instruction::UDiv || Opc == Instruction::LShr || Opc == Instruction::AShr) { if (Record[OpNum] & (1 << bitc::PEO_EXACT)) cast(I)->setIsExact(true); } else if (isa(I)) { FastMathFlags FMF; if (0 != (Record[OpNum] & FastMathFlags::UnsafeAlgebra)) FMF.setUnsafeAlgebra(); if (0 != (Record[OpNum] & FastMathFlags::NoNaNs)) FMF.setNoNaNs(); if (0 != (Record[OpNum] & FastMathFlags::NoInfs)) FMF.setNoInfs(); if (0 != (Record[OpNum] & FastMathFlags::NoSignedZeros)) FMF.setNoSignedZeros(); if (0 != (Record[OpNum] & FastMathFlags::AllowReciprocal)) FMF.setAllowReciprocal(); if (FMF.any()) I->setFastMathFlags(FMF); } } break; } case bitc::FUNC_CODE_INST_CAST: { // CAST: [opval, opty, destty, castopc] unsigned OpNum = 0; Value *Op; if (getValueTypePair(Record, OpNum, NextValueNo, Op) || OpNum+2 != Record.size()) return Error("Invalid record"); Type *ResTy = getTypeByID(Record[OpNum]); int Opc = GetDecodedCastOpcode(Record[OpNum+1]); if (Opc == -1 || !ResTy) return Error("Invalid record"); Instruction *Temp = nullptr; if ((I = UpgradeBitCastInst(Opc, Op, ResTy, Temp))) { if (Temp) { InstructionList.push_back(Temp); CurBB->getInstList().push_back(Temp); } } else { I = CastInst::Create((Instruction::CastOps)Opc, Op, ResTy); } InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_INBOUNDS_GEP: case bitc::FUNC_CODE_INST_GEP: { // GEP: [n x operands] unsigned OpNum = 0; Value *BasePtr; if (getValueTypePair(Record, OpNum, NextValueNo, BasePtr)) return Error("Invalid record"); SmallVector GEPIdx; while (OpNum != Record.size()) { Value *Op; if (getValueTypePair(Record, OpNum, NextValueNo, Op)) return Error("Invalid record"); GEPIdx.push_back(Op); } I = GetElementPtrInst::Create(BasePtr, GEPIdx); InstructionList.push_back(I); if (BitCode == bitc::FUNC_CODE_INST_INBOUNDS_GEP) cast(I)->setIsInBounds(true); break; } case bitc::FUNC_CODE_INST_EXTRACTVAL: { // EXTRACTVAL: [opty, opval, n x indices] unsigned OpNum = 0; Value *Agg; if (getValueTypePair(Record, OpNum, NextValueNo, Agg)) return Error("Invalid record"); SmallVector EXTRACTVALIdx; for (unsigned RecSize = Record.size(); OpNum != RecSize; ++OpNum) { uint64_t Index = Record[OpNum]; if ((unsigned)Index != Index) return Error("Invalid value"); EXTRACTVALIdx.push_back((unsigned)Index); } I = ExtractValueInst::Create(Agg, EXTRACTVALIdx); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_INSERTVAL: { // INSERTVAL: [opty, opval, opty, opval, n x indices] unsigned OpNum = 0; Value *Agg; if (getValueTypePair(Record, OpNum, NextValueNo, Agg)) return Error("Invalid record"); Value *Val; if (getValueTypePair(Record, OpNum, NextValueNo, Val)) return Error("Invalid record"); SmallVector INSERTVALIdx; for (unsigned RecSize = Record.size(); OpNum != RecSize; ++OpNum) { uint64_t Index = Record[OpNum]; if ((unsigned)Index != Index) return Error("Invalid value"); INSERTVALIdx.push_back((unsigned)Index); } I = InsertValueInst::Create(Agg, Val, INSERTVALIdx); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_SELECT: { // SELECT: [opval, ty, opval, opval] // obsolete form of select // handles select i1 ... in old bitcode unsigned OpNum = 0; Value *TrueVal, *FalseVal, *Cond; if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) || popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) || popValue(Record, OpNum, NextValueNo, Type::getInt1Ty(Context), Cond)) return Error("Invalid record"); I = SelectInst::Create(Cond, TrueVal, FalseVal); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_VSELECT: {// VSELECT: [ty,opval,opval,predty,pred] // new form of select // handles select i1 or select [N x i1] unsigned OpNum = 0; Value *TrueVal, *FalseVal, *Cond; if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) || popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) || getValueTypePair(Record, OpNum, NextValueNo, Cond)) return Error("Invalid record"); // select condition can be either i1 or [N x i1] if (VectorType* vector_type = dyn_cast(Cond->getType())) { // expect if (vector_type->getElementType() != Type::getInt1Ty(Context)) return Error("Invalid type for value"); } else { // expect i1 if (Cond->getType() != Type::getInt1Ty(Context)) return Error("Invalid type for value"); } I = SelectInst::Create(Cond, TrueVal, FalseVal); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_EXTRACTELT: { // EXTRACTELT: [opty, opval, opval] unsigned OpNum = 0; Value *Vec, *Idx; if (getValueTypePair(Record, OpNum, NextValueNo, Vec) || getValueTypePair(Record, OpNum, NextValueNo, Idx)) return Error("Invalid record"); I = ExtractElementInst::Create(Vec, Idx); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_INSERTELT: { // INSERTELT: [ty, opval,opval,opval] unsigned OpNum = 0; Value *Vec, *Elt, *Idx; if (getValueTypePair(Record, OpNum, NextValueNo, Vec) || popValue(Record, OpNum, NextValueNo, cast(Vec->getType())->getElementType(), Elt) || getValueTypePair(Record, OpNum, NextValueNo, Idx)) return Error("Invalid record"); I = InsertElementInst::Create(Vec, Elt, Idx); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_SHUFFLEVEC: {// SHUFFLEVEC: [opval,ty,opval,opval] unsigned OpNum = 0; Value *Vec1, *Vec2, *Mask; if (getValueTypePair(Record, OpNum, NextValueNo, Vec1) || popValue(Record, OpNum, NextValueNo, Vec1->getType(), Vec2)) return Error("Invalid record"); if (getValueTypePair(Record, OpNum, NextValueNo, Mask)) return Error("Invalid record"); I = new ShuffleVectorInst(Vec1, Vec2, Mask); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_CMP: // CMP: [opty, opval, opval, pred] // Old form of ICmp/FCmp returning bool // Existed to differentiate between icmp/fcmp and vicmp/vfcmp which were // both legal on vectors but had different behaviour. case bitc::FUNC_CODE_INST_CMP2: { // CMP2: [opty, opval, opval, pred] // FCmp/ICmp returning bool or vector of bool unsigned OpNum = 0; Value *LHS, *RHS; if (getValueTypePair(Record, OpNum, NextValueNo, LHS) || popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) || OpNum+1 != Record.size()) return Error("Invalid record"); if (LHS->getType()->isFPOrFPVectorTy()) I = new FCmpInst((FCmpInst::Predicate)Record[OpNum], LHS, RHS); else I = new ICmpInst((ICmpInst::Predicate)Record[OpNum], LHS, RHS); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_RET: // RET: [opty,opval] { unsigned Size = Record.size(); if (Size == 0) { I = ReturnInst::Create(Context); InstructionList.push_back(I); break; } unsigned OpNum = 0; Value *Op = nullptr; if (getValueTypePair(Record, OpNum, NextValueNo, Op)) return Error("Invalid record"); if (OpNum != Record.size()) return Error("Invalid record"); I = ReturnInst::Create(Context, Op); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_BR: { // BR: [bb#, bb#, opval] or [bb#] if (Record.size() != 1 && Record.size() != 3) return Error("Invalid record"); BasicBlock *TrueDest = getBasicBlock(Record[0]); if (!TrueDest) return Error("Invalid record"); if (Record.size() == 1) { I = BranchInst::Create(TrueDest); InstructionList.push_back(I); } else { BasicBlock *FalseDest = getBasicBlock(Record[1]); Value *Cond = getValue(Record, 2, NextValueNo, Type::getInt1Ty(Context)); if (!FalseDest || !Cond) return Error("Invalid record"); I = BranchInst::Create(TrueDest, FalseDest, Cond); InstructionList.push_back(I); } break; } case bitc::FUNC_CODE_INST_SWITCH: { // SWITCH: [opty, op0, op1, ...] // Check magic if ((Record[0] >> 16) == SWITCH_INST_MAGIC) { // "New" SwitchInst format with case ranges. The changes to write this // format were reverted but we still recognize bitcode that uses it. // Hopefully someday we will have support for case ranges and can use // this format again. Type *OpTy = getTypeByID(Record[1]); unsigned ValueBitWidth = cast(OpTy)->getBitWidth(); Value *Cond = getValue(Record, 2, NextValueNo, OpTy); BasicBlock *Default = getBasicBlock(Record[3]); if (!OpTy || !Cond || !Default) return Error("Invalid record"); unsigned NumCases = Record[4]; SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases); InstructionList.push_back(SI); unsigned CurIdx = 5; for (unsigned i = 0; i != NumCases; ++i) { SmallVector CaseVals; unsigned NumItems = Record[CurIdx++]; for (unsigned ci = 0; ci != NumItems; ++ci) { bool isSingleNumber = Record[CurIdx++]; APInt Low; unsigned ActiveWords = 1; if (ValueBitWidth > 64) ActiveWords = Record[CurIdx++]; Low = ReadWideAPInt(makeArrayRef(&Record[CurIdx], ActiveWords), ValueBitWidth); CurIdx += ActiveWords; if (!isSingleNumber) { ActiveWords = 1; if (ValueBitWidth > 64) ActiveWords = Record[CurIdx++]; APInt High = ReadWideAPInt(makeArrayRef(&Record[CurIdx], ActiveWords), ValueBitWidth); CurIdx += ActiveWords; // FIXME: It is not clear whether values in the range should be // compared as signed or unsigned values. The partially // implemented changes that used this format in the past used // unsigned comparisons. for ( ; Low.ule(High); ++Low) CaseVals.push_back(ConstantInt::get(Context, Low)); } else CaseVals.push_back(ConstantInt::get(Context, Low)); } BasicBlock *DestBB = getBasicBlock(Record[CurIdx++]); for (SmallVector::iterator cvi = CaseVals.begin(), cve = CaseVals.end(); cvi != cve; ++cvi) SI->addCase(*cvi, DestBB); } I = SI; break; } // Old SwitchInst format without case ranges. if (Record.size() < 3 || (Record.size() & 1) == 0) return Error("Invalid record"); Type *OpTy = getTypeByID(Record[0]); Value *Cond = getValue(Record, 1, NextValueNo, OpTy); BasicBlock *Default = getBasicBlock(Record[2]); if (!OpTy || !Cond || !Default) return Error("Invalid record"); unsigned NumCases = (Record.size()-3)/2; SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases); InstructionList.push_back(SI); for (unsigned i = 0, e = NumCases; i != e; ++i) { ConstantInt *CaseVal = dyn_cast_or_null(getFnValueByID(Record[3+i*2], OpTy)); BasicBlock *DestBB = getBasicBlock(Record[1+3+i*2]); if (!CaseVal || !DestBB) { delete SI; return Error("Invalid record"); } SI->addCase(CaseVal, DestBB); } I = SI; break; } case bitc::FUNC_CODE_INST_INDIRECTBR: { // INDIRECTBR: [opty, op0, op1, ...] if (Record.size() < 2) return Error("Invalid record"); Type *OpTy = getTypeByID(Record[0]); Value *Address = getValue(Record, 1, NextValueNo, OpTy); if (!OpTy || !Address) return Error("Invalid record"); unsigned NumDests = Record.size()-2; IndirectBrInst *IBI = IndirectBrInst::Create(Address, NumDests); InstructionList.push_back(IBI); for (unsigned i = 0, e = NumDests; i != e; ++i) { if (BasicBlock *DestBB = getBasicBlock(Record[2+i])) { IBI->addDestination(DestBB); } else { delete IBI; return Error("Invalid record"); } } I = IBI; break; } case bitc::FUNC_CODE_INST_INVOKE: { // INVOKE: [attrs, cc, normBB, unwindBB, fnty, op0,op1,op2, ...] if (Record.size() < 4) return Error("Invalid record"); AttributeSet PAL = getAttributes(Record[0]); unsigned CCInfo = Record[1]; BasicBlock *NormalBB = getBasicBlock(Record[2]); BasicBlock *UnwindBB = getBasicBlock(Record[3]); unsigned OpNum = 4; Value *Callee; if (getValueTypePair(Record, OpNum, NextValueNo, Callee)) return Error("Invalid record"); PointerType *CalleeTy = dyn_cast(Callee->getType()); FunctionType *FTy = !CalleeTy ? nullptr : dyn_cast(CalleeTy->getElementType()); // Check that the right number of fixed parameters are here. if (!FTy || !NormalBB || !UnwindBB || Record.size() < OpNum+FTy->getNumParams()) return Error("Invalid record"); SmallVector Ops; for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) { Ops.push_back(getValue(Record, OpNum, NextValueNo, FTy->getParamType(i))); if (!Ops.back()) return Error("Invalid record"); } if (!FTy->isVarArg()) { if (Record.size() != OpNum) return Error("Invalid record"); } else { // Read type/value pairs for varargs params. while (OpNum != Record.size()) { Value *Op; if (getValueTypePair(Record, OpNum, NextValueNo, Op)) return Error("Invalid record"); Ops.push_back(Op); } } I = InvokeInst::Create(Callee, NormalBB, UnwindBB, Ops); InstructionList.push_back(I); cast(I)->setCallingConv( static_cast(CCInfo)); cast(I)->setAttributes(PAL); break; } case bitc::FUNC_CODE_INST_RESUME: { // RESUME: [opval] unsigned Idx = 0; Value *Val = nullptr; if (getValueTypePair(Record, Idx, NextValueNo, Val)) return Error("Invalid record"); I = ResumeInst::Create(Val); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_UNREACHABLE: // UNREACHABLE I = new UnreachableInst(Context); InstructionList.push_back(I); break; case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...] if (Record.size() < 1 || ((Record.size()-1)&1)) return Error("Invalid record"); Type *Ty = getTypeByID(Record[0]); if (!Ty) return Error("Invalid record"); PHINode *PN = PHINode::Create(Ty, (Record.size()-1)/2); InstructionList.push_back(PN); for (unsigned i = 0, e = Record.size()-1; i != e; i += 2) { Value *V; // With the new function encoding, it is possible that operands have // negative IDs (for forward references). Use a signed VBR // representation to keep the encoding small. if (UseRelativeIDs) V = getValueSigned(Record, 1+i, NextValueNo, Ty); else V = getValue(Record, 1+i, NextValueNo, Ty); BasicBlock *BB = getBasicBlock(Record[2+i]); if (!V || !BB) return Error("Invalid record"); PN->addIncoming(V, BB); } I = PN; break; } case bitc::FUNC_CODE_INST_LANDINGPAD: { // LANDINGPAD: [ty, val, val, num, (id0,val0 ...)?] unsigned Idx = 0; if (Record.size() < 4) return Error("Invalid record"); Type *Ty = getTypeByID(Record[Idx++]); if (!Ty) return Error("Invalid record"); Value *PersFn = nullptr; if (getValueTypePair(Record, Idx, NextValueNo, PersFn)) return Error("Invalid record"); bool IsCleanup = !!Record[Idx++]; unsigned NumClauses = Record[Idx++]; LandingPadInst *LP = LandingPadInst::Create(Ty, PersFn, NumClauses); LP->setCleanup(IsCleanup); for (unsigned J = 0; J != NumClauses; ++J) { LandingPadInst::ClauseType CT = LandingPadInst::ClauseType(Record[Idx++]); (void)CT; Value *Val; if (getValueTypePair(Record, Idx, NextValueNo, Val)) { delete LP; return Error("Invalid record"); } assert((CT != LandingPadInst::Catch || !isa(Val->getType())) && "Catch clause has a invalid type!"); assert((CT != LandingPadInst::Filter || isa(Val->getType())) && "Filter clause has invalid type!"); LP->addClause(cast(Val)); } I = LP; InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, opty, op, align] if (Record.size() != 4) return Error("Invalid record"); PointerType *Ty = dyn_cast_or_null(getTypeByID(Record[0])); Type *OpTy = getTypeByID(Record[1]); Value *Size = getFnValueByID(Record[2], OpTy); unsigned AlignRecord = Record[3]; bool InAlloca = AlignRecord & (1 << 5); unsigned Align = AlignRecord & ((1 << 5) - 1); if (!Ty || !Size) return Error("Invalid record"); AllocaInst *AI = new AllocaInst(Ty->getElementType(), Size, (1 << Align) >> 1); AI->setUsedWithInAlloca(InAlloca); I = AI; InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_LOAD: { // LOAD: [opty, op, align, vol] unsigned OpNum = 0; Value *Op; if (getValueTypePair(Record, OpNum, NextValueNo, Op) || OpNum+2 != Record.size()) return Error("Invalid record"); I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_LOADATOMIC: { // LOADATOMIC: [opty, op, align, vol, ordering, synchscope] unsigned OpNum = 0; Value *Op; if (getValueTypePair(Record, OpNum, NextValueNo, Op) || OpNum+4 != Record.size()) return Error("Invalid record"); AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]); if (Ordering == NotAtomic || Ordering == Release || Ordering == AcquireRelease) return Error("Invalid record"); if (Ordering != NotAtomic && Record[OpNum] == 0) return Error("Invalid record"); SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]); I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1, Ordering, SynchScope); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_STORE: { // STORE2:[ptrty, ptr, val, align, vol] unsigned OpNum = 0; Value *Val, *Ptr; if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) || popValue(Record, OpNum, NextValueNo, cast(Ptr->getType())->getElementType(), Val) || OpNum+2 != Record.size()) return Error("Invalid record"); I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_STOREATOMIC: { // STOREATOMIC: [ptrty, ptr, val, align, vol, ordering, synchscope] unsigned OpNum = 0; Value *Val, *Ptr; if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) || popValue(Record, OpNum, NextValueNo, cast(Ptr->getType())->getElementType(), Val) || OpNum+4 != Record.size()) return Error("Invalid record"); AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]); if (Ordering == NotAtomic || Ordering == Acquire || Ordering == AcquireRelease) return Error("Invalid record"); SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]); if (Ordering != NotAtomic && Record[OpNum] == 0) return Error("Invalid record"); I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1, Ordering, SynchScope); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_CMPXCHG: { // CMPXCHG:[ptrty, ptr, cmp, new, vol, successordering, synchscope, // failureordering?, isweak?] unsigned OpNum = 0; Value *Ptr, *Cmp, *New; if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) || popValue(Record, OpNum, NextValueNo, cast(Ptr->getType())->getElementType(), Cmp) || popValue(Record, OpNum, NextValueNo, cast(Ptr->getType())->getElementType(), New) || (Record.size() < OpNum + 3 || Record.size() > OpNum + 5)) return Error("Invalid record"); AtomicOrdering SuccessOrdering = GetDecodedOrdering(Record[OpNum+1]); if (SuccessOrdering == NotAtomic || SuccessOrdering == Unordered) return Error("Invalid record"); SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+2]); AtomicOrdering FailureOrdering; if (Record.size() < 7) FailureOrdering = AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering); else FailureOrdering = GetDecodedOrdering(Record[OpNum+3]); I = new AtomicCmpXchgInst(Ptr, Cmp, New, SuccessOrdering, FailureOrdering, SynchScope); cast(I)->setVolatile(Record[OpNum]); if (Record.size() < 8) { // Before weak cmpxchgs existed, the instruction simply returned the // value loaded from memory, so bitcode files from that era will be // expecting the first component of a modern cmpxchg. CurBB->getInstList().push_back(I); I = ExtractValueInst::Create(I, 0); } else { cast(I)->setWeak(Record[OpNum+4]); } InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_ATOMICRMW: { // ATOMICRMW:[ptrty, ptr, val, op, vol, ordering, synchscope] unsigned OpNum = 0; Value *Ptr, *Val; if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) || popValue(Record, OpNum, NextValueNo, cast(Ptr->getType())->getElementType(), Val) || OpNum+4 != Record.size()) return Error("Invalid record"); AtomicRMWInst::BinOp Operation = GetDecodedRMWOperation(Record[OpNum]); if (Operation < AtomicRMWInst::FIRST_BINOP || Operation > AtomicRMWInst::LAST_BINOP) return Error("Invalid record"); AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]); if (Ordering == NotAtomic || Ordering == Unordered) return Error("Invalid record"); SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]); I = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SynchScope); cast(I)->setVolatile(Record[OpNum+1]); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_FENCE: { // FENCE:[ordering, synchscope] if (2 != Record.size()) return Error("Invalid record"); AtomicOrdering Ordering = GetDecodedOrdering(Record[0]); if (Ordering == NotAtomic || Ordering == Unordered || Ordering == Monotonic) return Error("Invalid record"); SynchronizationScope SynchScope = GetDecodedSynchScope(Record[1]); I = new FenceInst(Context, Ordering, SynchScope); InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_CALL: { // CALL: [paramattrs, cc, fnty, fnid, arg0, arg1...] if (Record.size() < 3) return Error("Invalid record"); AttributeSet PAL = getAttributes(Record[0]); unsigned CCInfo = Record[1]; unsigned OpNum = 2; Value *Callee; if (getValueTypePair(Record, OpNum, NextValueNo, Callee)) return Error("Invalid record"); PointerType *OpTy = dyn_cast(Callee->getType()); FunctionType *FTy = nullptr; if (OpTy) FTy = dyn_cast(OpTy->getElementType()); if (!FTy || Record.size() < FTy->getNumParams()+OpNum) return Error("Invalid record"); SmallVector Args; // Read the fixed params. for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) { if (FTy->getParamType(i)->isLabelTy()) Args.push_back(getBasicBlock(Record[OpNum])); else Args.push_back(getValue(Record, OpNum, NextValueNo, FTy->getParamType(i))); if (!Args.back()) return Error("Invalid record"); } // Read type/value pairs for varargs params. if (!FTy->isVarArg()) { if (OpNum != Record.size()) return Error("Invalid record"); } else { while (OpNum != Record.size()) { Value *Op; if (getValueTypePair(Record, OpNum, NextValueNo, Op)) return Error("Invalid record"); Args.push_back(Op); } } I = CallInst::Create(Callee, Args); InstructionList.push_back(I); cast(I)->setCallingConv( static_cast((~(1U << 14) & CCInfo) >> 1)); CallInst::TailCallKind TCK = CallInst::TCK_None; if (CCInfo & 1) TCK = CallInst::TCK_Tail; if (CCInfo & (1 << 14)) TCK = CallInst::TCK_MustTail; cast(I)->setTailCallKind(TCK); cast(I)->setAttributes(PAL); break; } case bitc::FUNC_CODE_INST_VAARG: { // VAARG: [valistty, valist, instty] if (Record.size() < 3) return Error("Invalid record"); Type *OpTy = getTypeByID(Record[0]); Value *Op = getValue(Record, 1, NextValueNo, OpTy); Type *ResTy = getTypeByID(Record[2]); if (!OpTy || !Op || !ResTy) return Error("Invalid record"); I = new VAArgInst(Op, ResTy); InstructionList.push_back(I); break; } } // Add instruction to end of current BB. If there is no current BB, reject // this file. if (!CurBB) { delete I; return Error("Invalid instruction with no BB"); } CurBB->getInstList().push_back(I); // If this was a terminator instruction, move to the next block. if (isa(I)) { ++CurBBNo; CurBB = CurBBNo < FunctionBBs.size() ? FunctionBBs[CurBBNo] : nullptr; } // Non-void values get registered in the value table for future use. if (I && !I->getType()->isVoidTy()) ValueList.AssignValue(I, NextValueNo++); } OutOfRecordLoop: // Check the function list for unresolved values. if (Argument *A = dyn_cast(ValueList.back())) { if (!A->getParent()) { // We found at least one unresolved value. Nuke them all to avoid leaks. for (unsigned i = ModuleValueListSize, e = ValueList.size(); i != e; ++i){ if ((A = dyn_cast_or_null(ValueList[i])) && !A->getParent()) { A->replaceAllUsesWith(UndefValue::get(A->getType())); delete A; } } return Error("Never resolved value found in function"); } } // FIXME: Check for unresolved forward-declared metadata references // and clean up leaks. // Trim the value list down to the size it was before we parsed this function. ValueList.shrinkTo(ModuleValueListSize); MDValueList.shrinkTo(ModuleMDValueListSize); std::vector().swap(FunctionBBs); return std::error_code(); } /// Find the function body in the bitcode stream std::error_code BitcodeReader::FindFunctionInStream( Function *F, DenseMap::iterator DeferredFunctionInfoIterator) { while (DeferredFunctionInfoIterator->second == 0) { if (Stream.AtEndOfStream()) return Error("Could not find function in stream"); // ParseModule will parse the next body in the stream and set its // position in the DeferredFunctionInfo map. if (std::error_code EC = ParseModule(true)) return EC; } return std::error_code(); } //===----------------------------------------------------------------------===// // GVMaterializer implementation //===----------------------------------------------------------------------===// void BitcodeReader::releaseBuffer() { Buffer.release(); } std::error_code BitcodeReader::materialize(GlobalValue *GV) { Function *F = dyn_cast(GV); // If it's not a function or is already material, ignore the request. if (!F || !F->isMaterializable()) return std::error_code(); DenseMap::iterator DFII = DeferredFunctionInfo.find(F); assert(DFII != DeferredFunctionInfo.end() && "Deferred function not found!"); // If its position is recorded as 0, its body is somewhere in the stream // but we haven't seen it yet. if (DFII->second == 0 && LazyStreamer) if (std::error_code EC = FindFunctionInStream(F, DFII)) return EC; // Move the bit stream to the saved position of the deferred function body. Stream.JumpToBit(DFII->second); if (std::error_code EC = ParseFunctionBody(F)) return EC; F->setIsMaterializable(false); // Upgrade any old intrinsic calls in the function. for (UpgradedIntrinsicMap::iterator I = UpgradedIntrinsics.begin(), E = UpgradedIntrinsics.end(); I != E; ++I) { if (I->first != I->second) { for (auto UI = I->first->user_begin(), UE = I->first->user_end(); UI != UE;) { if (CallInst* CI = dyn_cast(*UI++)) UpgradeIntrinsicCall(CI, I->second); } } } // Bring in any functions that this function forward-referenced via // blockaddresses. return materializeForwardReferencedFunctions(); } bool BitcodeReader::isDematerializable(const GlobalValue *GV) const { const Function *F = dyn_cast(GV); if (!F || F->isDeclaration()) return false; // Dematerializing F would leave dangling references that wouldn't be // reconnected on re-materialization. if (BlockAddressesTaken.count(F)) return false; return DeferredFunctionInfo.count(const_cast(F)); } void BitcodeReader::Dematerialize(GlobalValue *GV) { Function *F = dyn_cast(GV); // If this function isn't dematerializable, this is a noop. if (!F || !isDematerializable(F)) return; assert(DeferredFunctionInfo.count(F) && "No info to read function later?"); // Just forget the function body, we can remat it later. F->dropAllReferences(); F->setIsMaterializable(true); } std::error_code BitcodeReader::MaterializeModule(Module *M) { assert(M == TheModule && "Can only Materialize the Module this BitcodeReader is attached to."); // Promise to materialize all forward references. WillMaterializeAllForwardRefs = true; // Iterate over the module, deserializing any functions that are still on // disk. for (Module::iterator F = TheModule->begin(), E = TheModule->end(); F != E; ++F) { if (std::error_code EC = materialize(F)) return EC; } // At this point, if there are any function bodies, the current bit is // pointing to the END_BLOCK record after them. Now make sure the rest // of the bits in the module have been read. if (NextUnreadBit) ParseModule(true); // Check that all block address forward references got resolved (as we // promised above). if (!BasicBlockFwdRefs.empty()) return Error("Never resolved function from blockaddress"); // Upgrade any intrinsic calls that slipped through (should not happen!) and // delete the old functions to clean up. We can't do this unless the entire // module is materialized because there could always be another function body // with calls to the old function. for (std::vector >::iterator I = UpgradedIntrinsics.begin(), E = UpgradedIntrinsics.end(); I != E; ++I) { if (I->first != I->second) { for (auto UI = I->first->user_begin(), UE = I->first->user_end(); UI != UE;) { if (CallInst* CI = dyn_cast(*UI++)) UpgradeIntrinsicCall(CI, I->second); } if (!I->first->use_empty()) I->first->replaceAllUsesWith(I->second); I->first->eraseFromParent(); } } std::vector >().swap(UpgradedIntrinsics); for (unsigned I = 0, E = InstsWithTBAATag.size(); I < E; I++) UpgradeInstWithTBAATag(InstsWithTBAATag[I]); UpgradeDebugInfo(*M); return std::error_code(); } std::vector BitcodeReader::getIdentifiedStructTypes() const { return IdentifiedStructTypes; } std::error_code BitcodeReader::InitStream() { if (LazyStreamer) return InitLazyStream(); return InitStreamFromBuffer(); } std::error_code BitcodeReader::InitStreamFromBuffer() { const unsigned char *BufPtr = (const unsigned char*)Buffer->getBufferStart(); const unsigned char *BufEnd = BufPtr+Buffer->getBufferSize(); if (Buffer->getBufferSize() & 3) return Error("Invalid bitcode signature"); // If we have a wrapper header, parse it and ignore the non-bc file contents. // The magic number is 0x0B17C0DE stored in little endian. if (isBitcodeWrapper(BufPtr, BufEnd)) if (SkipBitcodeWrapperHeader(BufPtr, BufEnd, true)) return Error("Invalid bitcode wrapper header"); StreamFile.reset(new BitstreamReader(BufPtr, BufEnd)); Stream.init(&*StreamFile); return std::error_code(); } std::error_code BitcodeReader::InitLazyStream() { // Check and strip off the bitcode wrapper; BitstreamReader expects never to // see it. auto OwnedBytes = llvm::make_unique(LazyStreamer); StreamingMemoryObject &Bytes = *OwnedBytes; StreamFile = llvm::make_unique(std::move(OwnedBytes)); Stream.init(&*StreamFile); unsigned char buf[16]; if (Bytes.readBytes(buf, 16, 0) != 16) return Error("Invalid bitcode signature"); if (!isBitcode(buf, buf + 16)) return Error("Invalid bitcode signature"); if (isBitcodeWrapper(buf, buf + 4)) { const unsigned char *bitcodeStart = buf; const unsigned char *bitcodeEnd = buf + 16; SkipBitcodeWrapperHeader(bitcodeStart, bitcodeEnd, false); Bytes.dropLeadingBytes(bitcodeStart - buf); Bytes.setKnownObjectSize(bitcodeEnd - bitcodeStart); } return std::error_code(); } namespace { class BitcodeErrorCategoryType : public std::error_category { const char *name() const LLVM_NOEXCEPT override { return "llvm.bitcode"; } std::string message(int IE) const override { BitcodeError E = static_cast(IE); switch (E) { case BitcodeError::InvalidBitcodeSignature: return "Invalid bitcode signature"; case BitcodeError::CorruptedBitcode: return "Corrupted bitcode"; } llvm_unreachable("Unknown error type!"); } }; } static ManagedStatic ErrorCategory; const std::error_category &llvm::BitcodeErrorCategory() { return *ErrorCategory; } //===----------------------------------------------------------------------===// // External interface //===----------------------------------------------------------------------===// /// \brief Get a lazy one-at-time loading module from bitcode. /// /// This isn't always used in a lazy context. In particular, it's also used by /// \a parseBitcodeFile(). If this is truly lazy, then we need to eagerly pull /// in forward-referenced functions from block address references. /// /// \param[in] WillMaterializeAll Set to \c true if the caller promises to /// materialize everything -- in particular, if this isn't truly lazy. static ErrorOr getLazyBitcodeModuleImpl(std::unique_ptr &&Buffer, LLVMContext &Context, bool WillMaterializeAll, DiagnosticHandlerFunction DiagnosticHandler) { Module *M = new Module(Buffer->getBufferIdentifier(), Context); BitcodeReader *R = new BitcodeReader(Buffer.get(), Context, DiagnosticHandler); M->setMaterializer(R); auto cleanupOnError = [&](std::error_code EC) { R->releaseBuffer(); // Never take ownership on error. delete M; // Also deletes R. return EC; }; if (std::error_code EC = R->ParseBitcodeInto(M)) return cleanupOnError(EC); if (!WillMaterializeAll) // Resolve forward references from blockaddresses. if (std::error_code EC = R->materializeForwardReferencedFunctions()) return cleanupOnError(EC); Buffer.release(); // The BitcodeReader owns it now. return M; } ErrorOr llvm::getLazyBitcodeModule(std::unique_ptr &&Buffer, LLVMContext &Context, DiagnosticHandlerFunction DiagnosticHandler) { return getLazyBitcodeModuleImpl(std::move(Buffer), Context, false, DiagnosticHandler); } ErrorOr> llvm::getStreamedBitcodeModule(StringRef Name, DataStreamer *Streamer, LLVMContext &Context, DiagnosticHandlerFunction DiagnosticHandler) { std::unique_ptr M = make_unique(Name, Context); BitcodeReader *R = new BitcodeReader(Streamer, Context, DiagnosticHandler); M->setMaterializer(R); if (std::error_code EC = R->ParseBitcodeInto(M.get())) return EC; return std::move(M); } ErrorOr llvm::parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, DiagnosticHandlerFunction DiagnosticHandler) { std::unique_ptr Buf = MemoryBuffer::getMemBuffer(Buffer, false); ErrorOr ModuleOrErr = getLazyBitcodeModuleImpl( std::move(Buf), Context, true, DiagnosticHandler); if (!ModuleOrErr) return ModuleOrErr; Module *M = ModuleOrErr.get(); // Read in the entire module, and destroy the BitcodeReader. if (std::error_code EC = M->materializeAllPermanently()) { delete M; return EC; } // TODO: Restore the use-lists to the in-memory state when the bitcode was // written. We must defer until the Module has been fully materialized. return M; } std::string llvm::getBitcodeTargetTriple(MemoryBufferRef Buffer, LLVMContext &Context, DiagnosticHandlerFunction DiagnosticHandler) { std::unique_ptr Buf = MemoryBuffer::getMemBuffer(Buffer, false); auto R = llvm::make_unique(Buf.release(), Context, DiagnosticHandler); ErrorOr Triple = R->parseTriple(); if (Triple.getError()) return ""; return Triple.get(); } Index: projects/clang360-import/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.h =================================================================== --- projects/clang360-import/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.h (revision 279024) +++ projects/clang360-import/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.h (revision 279025) @@ -1,363 +1,365 @@ //===- BitcodeReader.h - Internal BitcodeReader impl ------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This header defines the BitcodeReader class. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_BITCODE_READER_BITCODEREADER_H #define LLVM_LIB_BITCODE_READER_BITCODEREADER_H #include "llvm/ADT/DenseMap.h" #include "llvm/Bitcode/BitstreamReader.h" #include "llvm/Bitcode/LLVMBitCodes.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/GVMaterializer.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/TrackingMDRef.h" #include "llvm/IR/Type.h" #include "llvm/IR/ValueHandle.h" #include #include #include namespace llvm { class Comdat; class MemoryBuffer; class LLVMContext; //===----------------------------------------------------------------------===// // BitcodeReaderValueList Class //===----------------------------------------------------------------------===// class BitcodeReaderValueList { std::vector ValuePtrs; /// ResolveConstants - As we resolve forward-referenced constants, we add /// information about them to this vector. This allows us to resolve them in /// bulk instead of resolving each reference at a time. See the code in /// ResolveConstantForwardRefs for more information about this. /// /// The key of this vector is the placeholder constant, the value is the slot /// number that holds the resolved value. typedef std::vector > ResolveConstantsTy; ResolveConstantsTy ResolveConstants; LLVMContext &Context; public: BitcodeReaderValueList(LLVMContext &C) : Context(C) {} ~BitcodeReaderValueList() { assert(ResolveConstants.empty() && "Constants not resolved?"); } // vector compatibility methods unsigned size() const { return ValuePtrs.size(); } void resize(unsigned N) { ValuePtrs.resize(N); } void push_back(Value *V) { ValuePtrs.push_back(V); } void clear() { assert(ResolveConstants.empty() && "Constants not resolved?"); ValuePtrs.clear(); } Value *operator[](unsigned i) const { assert(i < ValuePtrs.size()); return ValuePtrs[i]; } Value *back() const { return ValuePtrs.back(); } void pop_back() { ValuePtrs.pop_back(); } bool empty() const { return ValuePtrs.empty(); } void shrinkTo(unsigned N) { assert(N <= size() && "Invalid shrinkTo request!"); ValuePtrs.resize(N); } Constant *getConstantFwdRef(unsigned Idx, Type *Ty); Value *getValueFwdRef(unsigned Idx, Type *Ty); void AssignValue(Value *V, unsigned Idx); /// ResolveConstantForwardRefs - Once all constants are read, this method bulk /// resolves any forward references. void ResolveConstantForwardRefs(); }; //===----------------------------------------------------------------------===// // BitcodeReaderMDValueList Class //===----------------------------------------------------------------------===// class BitcodeReaderMDValueList { unsigned NumFwdRefs; bool AnyFwdRefs; + unsigned MinFwdRef; + unsigned MaxFwdRef; std::vector MDValuePtrs; LLVMContext &Context; public: BitcodeReaderMDValueList(LLVMContext &C) : NumFwdRefs(0), AnyFwdRefs(false), Context(C) {} // vector compatibility methods unsigned size() const { return MDValuePtrs.size(); } void resize(unsigned N) { MDValuePtrs.resize(N); } void push_back(Metadata *MD) { MDValuePtrs.emplace_back(MD); } void clear() { MDValuePtrs.clear(); } Metadata *back() const { return MDValuePtrs.back(); } void pop_back() { MDValuePtrs.pop_back(); } bool empty() const { return MDValuePtrs.empty(); } Metadata *operator[](unsigned i) const { assert(i < MDValuePtrs.size()); return MDValuePtrs[i]; } void shrinkTo(unsigned N) { assert(N <= size() && "Invalid shrinkTo request!"); MDValuePtrs.resize(N); } Metadata *getValueFwdRef(unsigned Idx); void AssignValue(Metadata *MD, unsigned Idx); void tryToResolveCycles(); }; class BitcodeReader : public GVMaterializer { LLVMContext &Context; DiagnosticHandlerFunction DiagnosticHandler; Module *TheModule; std::unique_ptr Buffer; std::unique_ptr StreamFile; BitstreamCursor Stream; DataStreamer *LazyStreamer; uint64_t NextUnreadBit; bool SeenValueSymbolTable; std::vector TypeList; BitcodeReaderValueList ValueList; BitcodeReaderMDValueList MDValueList; std::vector ComdatList; SmallVector InstructionList; std::vector > GlobalInits; std::vector > AliasInits; std::vector > FunctionPrefixes; std::vector > FunctionPrologues; SmallVector InstsWithTBAATag; /// MAttributes - The set of attributes by index. Index zero in the /// file is for null, and is thus not represented here. As such all indices /// are off by one. std::vector MAttributes; /// \brief The set of attribute groups. std::map MAttributeGroups; /// FunctionBBs - While parsing a function body, this is a list of the basic /// blocks for the function. std::vector FunctionBBs; // When reading the module header, this list is populated with functions that // have bodies later in the file. std::vector FunctionsWithBodies; // When intrinsic functions are encountered which require upgrading they are // stored here with their replacement function. typedef std::vector > UpgradedIntrinsicMap; UpgradedIntrinsicMap UpgradedIntrinsics; // Map the bitcode's custom MDKind ID to the Module's MDKind ID. DenseMap MDKindMap; // Several operations happen after the module header has been read, but // before function bodies are processed. This keeps track of whether // we've done this yet. bool SeenFirstFunctionBody; /// DeferredFunctionInfo - When function bodies are initially scanned, this /// map contains info about where to find deferred function body in the /// stream. DenseMap DeferredFunctionInfo; /// These are basic blocks forward-referenced by block addresses. They are /// inserted lazily into functions when they're loaded. The basic block ID is /// its index into the vector. DenseMap> BasicBlockFwdRefs; std::deque BasicBlockFwdRefQueue; /// UseRelativeIDs - Indicates that we are using a new encoding for /// instruction operands where most operands in the current /// FUNCTION_BLOCK are encoded relative to the instruction number, /// for a more compact encoding. Some instruction operands are not /// relative to the instruction ID: basic block numbers, and types. /// Once the old style function blocks have been phased out, we would /// not need this flag. bool UseRelativeIDs; /// True if all functions will be materialized, negating the need to process /// (e.g.) blockaddress forward references. bool WillMaterializeAllForwardRefs; /// Functions that have block addresses taken. This is usually empty. SmallPtrSet BlockAddressesTaken; public: std::error_code Error(BitcodeError E, const Twine &Message); std::error_code Error(BitcodeError E); std::error_code Error(const Twine &Message); explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C, DiagnosticHandlerFunction DiagnosticHandler); explicit BitcodeReader(DataStreamer *streamer, LLVMContext &C, DiagnosticHandlerFunction DiagnosticHandler); ~BitcodeReader() { FreeState(); } std::error_code materializeForwardReferencedFunctions(); void FreeState(); void releaseBuffer(); bool isDematerializable(const GlobalValue *GV) const override; std::error_code materialize(GlobalValue *GV) override; std::error_code MaterializeModule(Module *M) override; std::vector getIdentifiedStructTypes() const override; void Dematerialize(GlobalValue *GV) override; /// @brief Main interface to parsing a bitcode buffer. /// @returns true if an error occurred. std::error_code ParseBitcodeInto(Module *M); /// @brief Cheap mechanism to just extract module triple /// @returns true if an error occurred. ErrorOr parseTriple(); static uint64_t decodeSignRotatedValue(uint64_t V); private: std::vector IdentifiedStructTypes; StructType *createIdentifiedStructType(LLVMContext &Context, StringRef Name); StructType *createIdentifiedStructType(LLVMContext &Context); Type *getTypeByID(unsigned ID); Value *getFnValueByID(unsigned ID, Type *Ty) { if (Ty && Ty->isMetadataTy()) return MetadataAsValue::get(Ty->getContext(), getFnMetadataByID(ID)); return ValueList.getValueFwdRef(ID, Ty); } Metadata *getFnMetadataByID(unsigned ID) { return MDValueList.getValueFwdRef(ID); } BasicBlock *getBasicBlock(unsigned ID) const { if (ID >= FunctionBBs.size()) return nullptr; // Invalid ID return FunctionBBs[ID]; } AttributeSet getAttributes(unsigned i) const { if (i-1 < MAttributes.size()) return MAttributes[i-1]; return AttributeSet(); } /// getValueTypePair - Read a value/type pair out of the specified record from /// slot 'Slot'. Increment Slot past the number of slots used in the record. /// Return true on failure. bool getValueTypePair(SmallVectorImpl &Record, unsigned &Slot, unsigned InstNum, Value *&ResVal) { if (Slot == Record.size()) return true; unsigned ValNo = (unsigned)Record[Slot++]; // Adjust the ValNo, if it was encoded relative to the InstNum. if (UseRelativeIDs) ValNo = InstNum - ValNo; if (ValNo < InstNum) { // If this is not a forward reference, just return the value we already // have. ResVal = getFnValueByID(ValNo, nullptr); return ResVal == nullptr; } else if (Slot == Record.size()) { return true; } unsigned TypeNo = (unsigned)Record[Slot++]; ResVal = getFnValueByID(ValNo, getTypeByID(TypeNo)); return ResVal == nullptr; } /// popValue - Read a value out of the specified record from slot 'Slot'. /// Increment Slot past the number of slots used by the value in the record. /// Return true if there is an error. bool popValue(SmallVectorImpl &Record, unsigned &Slot, unsigned InstNum, Type *Ty, Value *&ResVal) { if (getValue(Record, Slot, InstNum, Ty, ResVal)) return true; // All values currently take a single record slot. ++Slot; return false; } /// getValue -- Like popValue, but does not increment the Slot number. bool getValue(SmallVectorImpl &Record, unsigned Slot, unsigned InstNum, Type *Ty, Value *&ResVal) { ResVal = getValue(Record, Slot, InstNum, Ty); return ResVal == nullptr; } /// getValue -- Version of getValue that returns ResVal directly, /// or 0 if there is an error. Value *getValue(SmallVectorImpl &Record, unsigned Slot, unsigned InstNum, Type *Ty) { if (Slot == Record.size()) return nullptr; unsigned ValNo = (unsigned)Record[Slot]; // Adjust the ValNo, if it was encoded relative to the InstNum. if (UseRelativeIDs) ValNo = InstNum - ValNo; return getFnValueByID(ValNo, Ty); } /// getValueSigned -- Like getValue, but decodes signed VBRs. Value *getValueSigned(SmallVectorImpl &Record, unsigned Slot, unsigned InstNum, Type *Ty) { if (Slot == Record.size()) return nullptr; unsigned ValNo = (unsigned)decodeSignRotatedValue(Record[Slot]); // Adjust the ValNo, if it was encoded relative to the InstNum. if (UseRelativeIDs) ValNo = InstNum - ValNo; return getFnValueByID(ValNo, Ty); } std::error_code ParseAttrKind(uint64_t Code, Attribute::AttrKind *Kind); std::error_code ParseModule(bool Resume); std::error_code ParseAttributeBlock(); std::error_code ParseAttributeGroupBlock(); std::error_code ParseTypeTable(); std::error_code ParseTypeTableBody(); std::error_code ParseValueSymbolTable(); std::error_code ParseConstants(); std::error_code RememberAndSkipFunctionBody(); std::error_code ParseFunctionBody(Function *F); std::error_code GlobalCleanup(); std::error_code ResolveGlobalAndAliasInits(); std::error_code ParseMetadata(); std::error_code ParseMetadataAttachment(); ErrorOr parseModuleTriple(); std::error_code ParseUseLists(); std::error_code InitStream(); std::error_code InitStreamFromBuffer(); std::error_code InitLazyStream(); std::error_code FindFunctionInStream( Function *F, DenseMap::iterator DeferredFunctionInfoIterator); }; } // End llvm namespace #endif Index: projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (revision 279025) @@ -1,12782 +1,12788 @@ //===-- DAGCombiner.cpp - Implement a DAG node combiner -------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run // both before and after the DAG is legalized. // // This pass is not a substitute for the LLVM IR instcombine pass. This pass is // primarily intended to handle simplification opportunities that are implicit // in the LLVM IR and exposed by the various codegen lowering phases. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include using namespace llvm; #define DEBUG_TYPE "dagcombine" STATISTIC(NodesCombined , "Number of dag nodes combined"); STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created"); STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created"); STATISTIC(OpsNarrowed , "Number of load/op/store narrowed"); STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int"); STATISTIC(SlicedLoads, "Number of load sliced"); namespace { static cl::opt CombinerAA("combiner-alias-analysis", cl::Hidden, cl::desc("Enable DAG combiner alias-analysis heuristics")); static cl::opt CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden, cl::desc("Enable DAG combiner's use of IR alias analysis")); static cl::opt UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true), cl::desc("Enable DAG combiner's use of TBAA")); #ifndef NDEBUG static cl::opt CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden, cl::desc("Only use DAG-combiner alias analysis in this" " function")); #endif /// Hidden option to stress test load slicing, i.e., when this option /// is enabled, load slicing bypasses most of its profitability guards. static cl::opt StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden, cl::desc("Bypass the profitability model of load " "slicing"), cl::init(false)); static cl::opt MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true), cl::desc("DAG combiner may split indexing from loads")); //------------------------------ DAGCombiner ---------------------------------// class DAGCombiner { SelectionDAG &DAG; const TargetLowering &TLI; CombineLevel Level; CodeGenOpt::Level OptLevel; bool LegalOperations; bool LegalTypes; bool ForCodeSize; /// \brief Worklist of all of the nodes that need to be simplified. /// /// This must behave as a stack -- new nodes to process are pushed onto the /// back and when processing we pop off of the back. /// /// The worklist will not contain duplicates but may contain null entries /// due to nodes being deleted from the underlying DAG. SmallVector Worklist; /// \brief Mapping from an SDNode to its position on the worklist. /// /// This is used to find and remove nodes from the worklist (by nulling /// them) when they are deleted from the underlying DAG. It relies on /// stable indices of nodes within the worklist. DenseMap WorklistMap; /// \brief Set of nodes which have been combined (at least once). /// /// This is used to allow us to reliably add any operands of a DAG node /// which have not yet been combined to the worklist. SmallPtrSet CombinedNodes; // AA - Used for DAG load/store alias analysis. AliasAnalysis &AA; /// When an instruction is simplified, add all users of the instruction to /// the work lists because they might get more simplified now. void AddUsersToWorklist(SDNode *N) { for (SDNode *Node : N->uses()) AddToWorklist(Node); } /// Call the node-specific routine that folds each particular type of node. SDValue visit(SDNode *N); public: /// Add to the worklist making sure its instance is at the back (next to be /// processed.) void AddToWorklist(SDNode *N) { // Skip handle nodes as they can't usefully be combined and confuse the // zero-use deletion strategy. if (N->getOpcode() == ISD::HANDLENODE) return; if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second) Worklist.push_back(N); } /// Remove all instances of N from the worklist. void removeFromWorklist(SDNode *N) { CombinedNodes.erase(N); auto It = WorklistMap.find(N); if (It == WorklistMap.end()) return; // Not in the worklist. // Null out the entry rather than erasing it to avoid a linear operation. Worklist[It->second] = nullptr; WorklistMap.erase(It); } void deleteAndRecombine(SDNode *N); bool recursivelyDeleteUnusedNodes(SDNode *N); SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, bool AddTo = true); SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) { return CombineTo(N, &Res, 1, AddTo); } SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true) { SDValue To[] = { Res0, Res1 }; return CombineTo(N, To, 2, AddTo); } void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO); private: /// Check the specified integer node value to see if it can be simplified or /// if things it uses can be simplified by bit propagation. /// If so, return true. bool SimplifyDemandedBits(SDValue Op) { unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits(); APInt Demanded = APInt::getAllOnesValue(BitWidth); return SimplifyDemandedBits(Op, Demanded); } bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded); bool CombineToPreIndexedLoadStore(SDNode *N); bool CombineToPostIndexedLoadStore(SDNode *N); SDValue SplitIndexingFromLoad(LoadSDNode *LD); bool SliceUpLoad(SDNode *N); /// \brief Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed /// load. /// /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced. /// \param InVecVT type of the input vector to EVE with bitcasts resolved. /// \param EltNo index of the vector element to load. /// \param OriginalLoad load that EVE came from to be replaced. /// \returns EVE on success SDValue() on failure. SDValue ReplaceExtractVectorEltOfLoadWithNarrowedLoad( SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad); void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad); SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace); SDValue SExtPromoteOperand(SDValue Op, EVT PVT); SDValue ZExtPromoteOperand(SDValue Op, EVT PVT); SDValue PromoteIntBinOp(SDValue Op); SDValue PromoteIntShiftOp(SDValue Op); SDValue PromoteExtend(SDValue Op); bool PromoteLoad(SDValue Op); void ExtendSetCCUses(const SmallVectorImpl &SetCCs, SDValue Trunc, SDValue ExtLoad, SDLoc DL, ISD::NodeType ExtType); /// Call the node-specific routine that knows how to fold each /// particular type of node. If that doesn't do anything, try the /// target-specific DAG combines. SDValue combine(SDNode *N); // Visitation implementation - Implement dag node combining for different // node types. The semantics are as follows: // Return Value: // SDValue.getNode() == 0 - No change was made // SDValue.getNode() == N - N was replaced, is dead and has been handled. // otherwise - N should be replaced by the returned Operand. // SDValue visitTokenFactor(SDNode *N); SDValue visitMERGE_VALUES(SDNode *N); SDValue visitADD(SDNode *N); SDValue visitSUB(SDNode *N); SDValue visitADDC(SDNode *N); SDValue visitSUBC(SDNode *N); SDValue visitADDE(SDNode *N); SDValue visitSUBE(SDNode *N); SDValue visitMUL(SDNode *N); SDValue visitSDIV(SDNode *N); SDValue visitUDIV(SDNode *N); SDValue visitSREM(SDNode *N); SDValue visitUREM(SDNode *N); SDValue visitMULHU(SDNode *N); SDValue visitMULHS(SDNode *N); SDValue visitSMUL_LOHI(SDNode *N); SDValue visitUMUL_LOHI(SDNode *N); SDValue visitSMULO(SDNode *N); SDValue visitUMULO(SDNode *N); SDValue visitSDIVREM(SDNode *N); SDValue visitUDIVREM(SDNode *N); SDValue visitAND(SDNode *N); SDValue visitOR(SDNode *N); SDValue visitXOR(SDNode *N); SDValue SimplifyVBinOp(SDNode *N); SDValue SimplifyVUnaryOp(SDNode *N); SDValue visitSHL(SDNode *N); SDValue visitSRA(SDNode *N); SDValue visitSRL(SDNode *N); SDValue visitRotate(SDNode *N); SDValue visitCTLZ(SDNode *N); SDValue visitCTLZ_ZERO_UNDEF(SDNode *N); SDValue visitCTTZ(SDNode *N); SDValue visitCTTZ_ZERO_UNDEF(SDNode *N); SDValue visitCTPOP(SDNode *N); SDValue visitSELECT(SDNode *N); SDValue visitVSELECT(SDNode *N); SDValue visitSELECT_CC(SDNode *N); SDValue visitSETCC(SDNode *N); SDValue visitSIGN_EXTEND(SDNode *N); SDValue visitZERO_EXTEND(SDNode *N); SDValue visitANY_EXTEND(SDNode *N); SDValue visitSIGN_EXTEND_INREG(SDNode *N); SDValue visitTRUNCATE(SDNode *N); SDValue visitBITCAST(SDNode *N); SDValue visitBUILD_PAIR(SDNode *N); SDValue visitFADD(SDNode *N); SDValue visitFSUB(SDNode *N); SDValue visitFMUL(SDNode *N); SDValue visitFMA(SDNode *N); SDValue visitFDIV(SDNode *N); SDValue visitFREM(SDNode *N); SDValue visitFSQRT(SDNode *N); SDValue visitFCOPYSIGN(SDNode *N); SDValue visitSINT_TO_FP(SDNode *N); SDValue visitUINT_TO_FP(SDNode *N); SDValue visitFP_TO_SINT(SDNode *N); SDValue visitFP_TO_UINT(SDNode *N); SDValue visitFP_ROUND(SDNode *N); SDValue visitFP_ROUND_INREG(SDNode *N); SDValue visitFP_EXTEND(SDNode *N); SDValue visitFNEG(SDNode *N); SDValue visitFABS(SDNode *N); SDValue visitFCEIL(SDNode *N); SDValue visitFTRUNC(SDNode *N); SDValue visitFFLOOR(SDNode *N); SDValue visitFMINNUM(SDNode *N); SDValue visitFMAXNUM(SDNode *N); SDValue visitBRCOND(SDNode *N); SDValue visitBR_CC(SDNode *N); SDValue visitLOAD(SDNode *N); SDValue visitSTORE(SDNode *N); SDValue visitINSERT_VECTOR_ELT(SDNode *N); SDValue visitEXTRACT_VECTOR_ELT(SDNode *N); SDValue visitBUILD_VECTOR(SDNode *N); SDValue visitCONCAT_VECTORS(SDNode *N); SDValue visitEXTRACT_SUBVECTOR(SDNode *N); SDValue visitVECTOR_SHUFFLE(SDNode *N); SDValue visitINSERT_SUBVECTOR(SDNode *N); SDValue visitMLOAD(SDNode *N); SDValue visitMSTORE(SDNode *N); SDValue XformToShuffleWithZero(SDNode *N); SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS); SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt); bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS); SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N); SDValue SimplifySelect(SDLoc DL, SDValue N0, SDValue N1, SDValue N2); SDValue SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC, bool NotExtCompare = false); SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, SDLoc DL, bool foldBooleans = true); bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, SDValue &CC) const; bool isOneUseSetCC(SDValue N) const; SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, unsigned HiOp); SDValue CombineConsecutiveLoads(SDNode *N, EVT VT); SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT); SDValue BuildSDIV(SDNode *N); SDValue BuildSDIVPow2(SDNode *N); SDValue BuildUDIV(SDNode *N); SDValue BuildReciprocalEstimate(SDValue Op); SDValue BuildRsqrtEstimate(SDValue Op); SDValue BuildRsqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations); SDValue BuildRsqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations); SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits = true); SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); SDNode *MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, SDValue InnerPos, SDValue InnerNeg, unsigned PosOpcode, unsigned NegOpcode, SDLoc DL); SDNode *MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL); SDValue ReduceLoadWidth(SDNode *N); SDValue ReduceLoadOpStoreWidth(SDNode *N); SDValue TransformFPLoadStorePair(SDNode *N); SDValue reduceBuildVecExtToExtBuildVec(SDNode *N); SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N); SDValue GetDemandedBits(SDValue V, const APInt &Mask); /// Walk up chain skipping non-aliasing memory nodes, /// looking for aliasing nodes and adding them to the Aliases vector. void GatherAllAliases(SDNode *N, SDValue OriginalChain, SmallVectorImpl &Aliases); /// Return true if there is any possibility that the two addresses overlap. bool isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const; /// Walk up chain skipping non-aliasing memory nodes, looking for a better /// chain (aliasing node.) SDValue FindBetterChain(SDNode *N, SDValue Chain); /// Merge consecutive store operations into a wide store. /// This optimization uses wide integers or vectors when possible. /// \return True if some memory operations were changed. bool MergeConsecutiveStores(StoreSDNode *N); /// \brief Try to transform a truncation where C is a constant: /// (trunc (and X, C)) -> (and (trunc X), (trunc C)) /// /// \p N needs to be a truncation and its first operand an AND. Other /// requirements are checked by the function (e.g. that trunc is /// single-use) and if missed an empty SDValue is returned. SDValue distributeTruncateThroughAnd(SDNode *N); public: DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL) : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) { AttributeSet FnAttrs = DAG.getMachineFunction().getFunction()->getAttributes(); ForCodeSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) || FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); } /// Runs the dag combiner on all nodes in the work list void Run(CombineLevel AtLevel); SelectionDAG &getDAG() const { return DAG; } /// Returns a type large enough to hold any valid shift amount - before type /// legalization these can be huge. EVT getShiftAmountTy(EVT LHSTy) { assert(LHSTy.isInteger() && "Shift amount is not an integer type!"); if (LHSTy.isVector()) return LHSTy; return LegalTypes ? TLI.getScalarShiftAmountTy(LHSTy) : TLI.getPointerTy(); } /// This method returns true if we are running before type legalization or /// if the specified VT is legal. bool isTypeLegal(const EVT &VT) { if (!LegalTypes) return true; return TLI.isTypeLegal(VT); } /// Convenience wrapper around TargetLowering::getSetCCResultType EVT getSetCCResultType(EVT VT) const { return TLI.getSetCCResultType(*DAG.getContext(), VT); } }; } namespace { /// This class is a DAGUpdateListener that removes any deleted /// nodes from the worklist. class WorklistRemover : public SelectionDAG::DAGUpdateListener { DAGCombiner &DC; public: explicit WorklistRemover(DAGCombiner &dc) : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {} void NodeDeleted(SDNode *N, SDNode *E) override { DC.removeFromWorklist(N); } }; } //===----------------------------------------------------------------------===// // TargetLowering::DAGCombinerInfo implementation //===----------------------------------------------------------------------===// void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) { ((DAGCombiner*)DC)->AddToWorklist(N); } void TargetLowering::DAGCombinerInfo::RemoveFromWorklist(SDNode *N) { ((DAGCombiner*)DC)->removeFromWorklist(N); } SDValue TargetLowering::DAGCombinerInfo:: CombineTo(SDNode *N, const std::vector &To, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo); } SDValue TargetLowering::DAGCombinerInfo:: CombineTo(SDNode *N, SDValue Res, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo); } SDValue TargetLowering::DAGCombinerInfo:: CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo); } void TargetLowering::DAGCombinerInfo:: CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO); } //===----------------------------------------------------------------------===// // Helper Functions //===----------------------------------------------------------------------===// void DAGCombiner::deleteAndRecombine(SDNode *N) { removeFromWorklist(N); // If the operands of this node are only used by the node, they will now be // dead. Make sure to re-visit them and recursively delete dead nodes. for (const SDValue &Op : N->ops()) // For an operand generating multiple values, one of the values may // become dead allowing further simplification (e.g. split index // arithmetic from an indexed load). if (Op->hasOneUse() || Op->getNumValues() > 1) AddToWorklist(Op.getNode()); DAG.DeleteNode(N); } /// Return 1 if we can compute the negated form of the specified expression for /// the same cost as the expression itself, or 2 if we can compute the negated /// form more cheaply than the expression itself. static char isNegatibleForFree(SDValue Op, bool LegalOperations, const TargetLowering &TLI, const TargetOptions *Options, unsigned Depth = 0) { // fneg is removable even if it has multiple uses. if (Op.getOpcode() == ISD::FNEG) return 2; // Don't allow anything with multiple uses. if (!Op.hasOneUse()) return 0; // Don't recurse exponentially. if (Depth > 6) return 0; switch (Op.getOpcode()) { default: return false; case ISD::ConstantFP: // Don't invert constant FP values after legalize. The negated constant // isn't necessarily legal. return LegalOperations ? 0 : 1; case ISD::FADD: // FIXME: determine better conditions for this xform. if (!Options->UnsafeFPMath) return 0; // After operation legalization, it might not be legal to create new FSUBs. if (LegalOperations && !TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType())) return 0; // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options, Depth + 1)) return V; // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options, Depth + 1); case ISD::FSUB: // We can't turn -(A-B) into B-A when we honor signed zeros. if (!Options->UnsafeFPMath) return 0; // fold (fneg (fsub A, B)) -> (fsub B, A) return 1; case ISD::FMUL: case ISD::FDIV: if (Options->HonorSignDependentRoundingFPMath()) return 0; // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y)) if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options, Depth + 1)) return V; return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options, Depth + 1); case ISD::FP_EXTEND: case ISD::FP_ROUND: case ISD::FSIN: return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options, Depth + 1); } } /// If isNegatibleForFree returns true, return the newly negated expression. static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, unsigned Depth = 0) { const TargetOptions &Options = DAG.getTarget().Options; // fneg is removable even if it has multiple uses. if (Op.getOpcode() == ISD::FNEG) return Op.getOperand(0); // Don't allow anything with multiple uses. assert(Op.hasOneUse() && "Unknown reuse!"); assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree"); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown code"); case ISD::ConstantFP: { APFloat V = cast(Op)->getValueAPF(); V.changeSign(); return DAG.getConstantFP(V, Op.getValueType()); } case ISD::FADD: // FIXME: determine better conditions for this xform. assert(Options.UnsafeFPMath); // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) if (isNegatibleForFree(Op.getOperand(0), LegalOperations, DAG.getTargetLoweringInfo(), &Options, Depth+1)) return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), Op.getOperand(1)); // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(1), DAG, LegalOperations, Depth+1), Op.getOperand(0)); case ISD::FSUB: // We can't turn -(A-B) into B-A when we honor signed zeros. assert(Options.UnsafeFPMath); // fold (fneg (fsub 0, B)) -> B if (ConstantFPSDNode *N0CFP = dyn_cast(Op.getOperand(0))) if (N0CFP->getValueAPF().isZero()) return Op.getOperand(1); // fold (fneg (fsub A, B)) -> (fsub B, A) return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(0)); case ISD::FMUL: case ISD::FDIV: assert(!Options.HonorSignDependentRoundingFPMath()); // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) if (isNegatibleForFree(Op.getOperand(0), LegalOperations, DAG.getTargetLoweringInfo(), &Options, Depth+1)) return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), Op.getOperand(1)); // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y)) return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), Op.getOperand(0), GetNegatedExpression(Op.getOperand(1), DAG, LegalOperations, Depth+1)); case ISD::FP_EXTEND: case ISD::FSIN: return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1)); case ISD::FP_ROUND: return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), Op.getOperand(1)); } } // Return true if this node is a setcc, or is a select_cc // that selects between the target values used for true and false, making it // equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to // the appropriate nodes based on the type of node we are checking. This // simplifies life a bit for the callers. bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, SDValue &CC) const { if (N.getOpcode() == ISD::SETCC) { LHS = N.getOperand(0); RHS = N.getOperand(1); CC = N.getOperand(2); return true; } if (N.getOpcode() != ISD::SELECT_CC || !TLI.isConstTrueVal(N.getOperand(2).getNode()) || !TLI.isConstFalseVal(N.getOperand(3).getNode())) return false; if (TLI.getBooleanContents(N.getValueType()) == TargetLowering::UndefinedBooleanContent) return false; LHS = N.getOperand(0); RHS = N.getOperand(1); CC = N.getOperand(4); return true; } /// Return true if this is a SetCC-equivalent operation with only one use. /// If this is true, it allows the users to invert the operation for free when /// it is profitable to do so. bool DAGCombiner::isOneUseSetCC(SDValue N) const { SDValue N0, N1, N2; if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse()) return true; return false; } /// Returns true if N is a BUILD_VECTOR node whose /// elements are all the same constant or undefined. static bool isConstantSplatVector(SDNode *N, APInt& SplatValue) { BuildVectorSDNode *C = dyn_cast(N); if (!C) return false; APInt SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; EVT EltVT = N->getValueType(0).getVectorElementType(); return (C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs) && EltVT.getSizeInBits() >= SplatBitSize); } // \brief Returns the SDNode if it is a constant BuildVector or constant. static SDNode *isConstantBuildVectorOrConstantInt(SDValue N) { if (isa(N)) return N.getNode(); BuildVectorSDNode *BV = dyn_cast(N); if (BV && BV->isConstant()) return BV; return nullptr; } // \brief Returns the SDNode if it is a constant splat BuildVector or constant // int. static ConstantSDNode *isConstOrConstSplat(SDValue N) { if (ConstantSDNode *CN = dyn_cast(N)) return CN; if (BuildVectorSDNode *BV = dyn_cast(N)) { BitVector UndefElements; ConstantSDNode *CN = BV->getConstantSplatNode(&UndefElements); // BuildVectors can truncate their operands. Ignore that case here. // FIXME: We blindly ignore splats which include undef which is overly // pessimistic. if (CN && UndefElements.none() && CN->getValueType(0) == N.getValueType().getScalarType()) return CN; } return nullptr; } // \brief Returns the SDNode if it is a constant splat BuildVector or constant // float. static ConstantFPSDNode *isConstOrConstSplatFP(SDValue N) { if (ConstantFPSDNode *CN = dyn_cast(N)) return CN; if (BuildVectorSDNode *BV = dyn_cast(N)) { BitVector UndefElements; ConstantFPSDNode *CN = BV->getConstantFPSplatNode(&UndefElements); if (CN && UndefElements.none()) return CN; } return nullptr; } SDValue DAGCombiner::ReassociateOps(unsigned Opc, SDLoc DL, SDValue N0, SDValue N1) { EVT VT = N0.getValueType(); if (N0.getOpcode() == Opc) { if (SDNode *L = isConstantBuildVectorOrConstantInt(N0.getOperand(1))) { if (SDNode *R = isConstantBuildVectorOrConstantInt(N1)) { // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2)) if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, L, R)) return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode); return SDValue(); } if (N0.hasOneUse()) { // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one // use SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1); if (!OpNode.getNode()) return SDValue(); AddToWorklist(OpNode.getNode()); return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1)); } } } if (N1.getOpcode() == Opc) { if (SDNode *R = isConstantBuildVectorOrConstantInt(N1.getOperand(1))) { if (SDNode *L = isConstantBuildVectorOrConstantInt(N0)) { // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2)) if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, R, L)) return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode); return SDValue(); } if (N1.hasOneUse()) { // reassoc. (op y, (op x, c1)) -> (op (op x, y), c1) iff x+c1 has one // use SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N1.getOperand(0), N0); if (!OpNode.getNode()) return SDValue(); AddToWorklist(OpNode.getNode()); return DAG.getNode(Opc, DL, VT, OpNode, N1.getOperand(1)); } } } return SDValue(); } SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, bool AddTo) { assert(N->getNumValues() == NumTo && "Broken CombineTo call!"); ++NodesCombined; DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: "; To[0].getNode()->dump(&DAG); dbgs() << " and " << NumTo-1 << " other values\n"); for (unsigned i = 0, e = NumTo; i != e; ++i) assert((!To[i].getNode() || N->getValueType(i) == To[i].getValueType()) && "Cannot combine value to value of different type!"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesWith(N, To); if (AddTo) { // Push the new nodes and any users onto the worklist for (unsigned i = 0, e = NumTo; i != e; ++i) { if (To[i].getNode()) { AddToWorklist(To[i].getNode()); AddUsersToWorklist(To[i].getNode()); } } } // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node. if (N->use_empty()) deleteAndRecombine(N); return SDValue(N, 0); } void DAGCombiner:: CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { // Replace all uses. If any nodes become isomorphic to other nodes and // are deleted, make sure to remove them from our worklist. WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New); // Push the new node and any (possibly new) users onto the worklist. AddToWorklist(TLO.New.getNode()); AddUsersToWorklist(TLO.New.getNode()); // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node. if (TLO.Old.getNode()->use_empty()) deleteAndRecombine(TLO.Old.getNode()); } /// Check the specified integer node value to see if it can be simplified or if /// things it uses can be simplified by bit propagation. If so, return true. bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) { TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); APInt KnownZero, KnownOne; if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) return false; // Revisit the node. AddToWorklist(Op.getNode()); // Replace the old value with the new one. ++NodesCombined; DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG); dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG); dbgs() << '\n'); CommitTargetLoweringOpt(TLO); return true; } void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) { SDLoc dl(Load); EVT VT = Load->getValueType(0); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, SDValue(ExtLoad, 0)); DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: "; Trunc.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1)); deleteAndRecombine(Load); AddToWorklist(Trunc.getNode()); } SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) { Replace = false; SDLoc dl(Op); if (LoadSDNode *LD = dyn_cast(Op)) { EVT MemVT = LD->getMemoryVT(); ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD : ISD::EXTLOAD) : LD->getExtensionType(); Replace = true; return DAG.getExtLoad(ExtType, dl, PVT, LD->getChain(), LD->getBasePtr(), MemVT, LD->getMemOperand()); } unsigned Opc = Op.getOpcode(); switch (Opc) { default: break; case ISD::AssertSext: return DAG.getNode(ISD::AssertSext, dl, PVT, SExtPromoteOperand(Op.getOperand(0), PVT), Op.getOperand(1)); case ISD::AssertZext: return DAG.getNode(ISD::AssertZext, dl, PVT, ZExtPromoteOperand(Op.getOperand(0), PVT), Op.getOperand(1)); case ISD::Constant: { unsigned ExtOpc = Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; return DAG.getNode(ExtOpc, dl, PVT, Op); } } if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT)) return SDValue(); return DAG.getNode(ISD::ANY_EXTEND, dl, PVT, Op); } SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) { if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT)) return SDValue(); EVT OldVT = Op.getValueType(); SDLoc dl(Op); bool Replace = false; SDValue NewOp = PromoteOperand(Op, PVT, Replace); if (!NewOp.getNode()) return SDValue(); AddToWorklist(NewOp.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode()); return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NewOp.getValueType(), NewOp, DAG.getValueType(OldVT)); } SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) { EVT OldVT = Op.getValueType(); SDLoc dl(Op); bool Replace = false; SDValue NewOp = PromoteOperand(Op, PVT, Replace); if (!NewOp.getNode()) return SDValue(); AddToWorklist(NewOp.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode()); return DAG.getZeroExtendInReg(NewOp, dl, OldVT); } /// Promote the specified integer binary operation if the target indicates it is /// beneficial. e.g. On x86, it's usually better to promote i16 operations to /// i32 since i16 instructions are longer. SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { if (!LegalOperations) return SDValue(); EVT VT = Op.getValueType(); if (VT.isVector() || !VT.isInteger()) return SDValue(); // If operation type is 'undesirable', e.g. i16 on x86, consider // promoting it. unsigned Opc = Op.getOpcode(); if (TLI.isTypeDesirableForOp(Opc, VT)) return SDValue(); EVT PVT = VT; // Consult target whether it is a good idea to promote this operation and // what's the right type to promote it to. if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); bool Replace0 = false; SDValue N0 = Op.getOperand(0); SDValue NN0 = PromoteOperand(N0, PVT, Replace0); if (!NN0.getNode()) return SDValue(); bool Replace1 = false; SDValue N1 = Op.getOperand(1); SDValue NN1; if (N0 == N1) NN1 = NN0; else { NN1 = PromoteOperand(N1, PVT, Replace1); if (!NN1.getNode()) return SDValue(); } AddToWorklist(NN0.getNode()); if (NN1.getNode()) AddToWorklist(NN1.getNode()); if (Replace0) ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode()); if (Replace1) ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode()); DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); SDLoc dl(Op); return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(Opc, dl, PVT, NN0, NN1)); } return SDValue(); } /// Promote the specified integer shift operation if the target indicates it is /// beneficial. e.g. On x86, it's usually better to promote i16 operations to /// i32 since i16 instructions are longer. SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) { if (!LegalOperations) return SDValue(); EVT VT = Op.getValueType(); if (VT.isVector() || !VT.isInteger()) return SDValue(); // If operation type is 'undesirable', e.g. i16 on x86, consider // promoting it. unsigned Opc = Op.getOpcode(); if (TLI.isTypeDesirableForOp(Opc, VT)) return SDValue(); EVT PVT = VT; // Consult target whether it is a good idea to promote this operation and // what's the right type to promote it to. if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); bool Replace = false; SDValue N0 = Op.getOperand(0); if (Opc == ISD::SRA) N0 = SExtPromoteOperand(Op.getOperand(0), PVT); else if (Opc == ISD::SRL) N0 = ZExtPromoteOperand(Op.getOperand(0), PVT); else N0 = PromoteOperand(N0, PVT, Replace); if (!N0.getNode()) return SDValue(); AddToWorklist(N0.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode()); DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); SDLoc dl(Op); return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(Opc, dl, PVT, N0, Op.getOperand(1))); } return SDValue(); } SDValue DAGCombiner::PromoteExtend(SDValue Op) { if (!LegalOperations) return SDValue(); EVT VT = Op.getValueType(); if (VT.isVector() || !VT.isInteger()) return SDValue(); // If operation type is 'undesirable', e.g. i16 on x86, consider // promoting it. unsigned Opc = Op.getOpcode(); if (TLI.isTypeDesirableForOp(Opc, VT)) return SDValue(); EVT PVT = VT; // Consult target whether it is a good idea to promote this operation and // what's the right type to promote it to. if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); // fold (aext (aext x)) -> (aext x) // fold (aext (zext x)) -> (zext x) // fold (aext (sext x)) -> (sext x) DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0)); } return SDValue(); } bool DAGCombiner::PromoteLoad(SDValue Op) { if (!LegalOperations) return false; EVT VT = Op.getValueType(); if (VT.isVector() || !VT.isInteger()) return false; // If operation type is 'undesirable', e.g. i16 on x86, consider // promoting it. unsigned Opc = Op.getOpcode(); if (TLI.isTypeDesirableForOp(Opc, VT)) return false; EVT PVT = VT; // Consult target whether it is a good idea to promote this operation and // what's the right type to promote it to. if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); SDLoc dl(Op); SDNode *N = Op.getNode(); LoadSDNode *LD = cast(N); EVT MemVT = LD->getMemoryVT(); ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD : ISD::EXTLOAD) : LD->getExtensionType(); SDValue NewLD = DAG.getExtLoad(ExtType, dl, PVT, LD->getChain(), LD->getBasePtr(), MemVT, LD->getMemOperand()); SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, VT, NewLD); DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1)); deleteAndRecombine(N); AddToWorklist(Result.getNode()); return true; } return false; } /// \brief Recursively delete a node which has no uses and any operands for /// which it is the only use. /// /// Note that this both deletes the nodes and removes them from the worklist. /// It also adds any nodes who have had a user deleted to the worklist as they /// may now have only one use and subject to other combines. bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) { if (!N->use_empty()) return false; SmallSetVector Nodes; Nodes.insert(N); do { N = Nodes.pop_back_val(); if (!N) continue; if (N->use_empty()) { for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) Nodes.insert(N->getOperand(i).getNode()); removeFromWorklist(N); DAG.DeleteNode(N); } else { AddToWorklist(N); } } while (!Nodes.empty()); return true; } //===----------------------------------------------------------------------===// // Main DAG Combiner implementation //===----------------------------------------------------------------------===// void DAGCombiner::Run(CombineLevel AtLevel) { // set the instance variables, so that the various visit routines may use it. Level = AtLevel; LegalOperations = Level >= AfterLegalizeVectorOps; LegalTypes = Level >= AfterLegalizeTypes; // Early exit if this basic block is in an optnone function. AttributeSet FnAttrs = DAG.getMachineFunction().getFunction()->getAttributes(); if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeNone)) return; // Add all the dag nodes to the worklist. for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), E = DAG.allnodes_end(); I != E; ++I) AddToWorklist(I); // Create a dummy node (which is not added to allnodes), that adds a reference // to the root node, preventing it from being deleted, and tracking any // changes of the root. HandleSDNode Dummy(DAG.getRoot()); // while the worklist isn't empty, find a node and // try and combine it. while (!WorklistMap.empty()) { SDNode *N; // The Worklist holds the SDNodes in order, but it may contain null entries. do { N = Worklist.pop_back_val(); } while (!N); bool GoodWorklistEntry = WorklistMap.erase(N); (void)GoodWorklistEntry; assert(GoodWorklistEntry && "Found a worklist entry without a corresponding map entry!"); // If N has no uses, it is dead. Make sure to revisit all N's operands once // N is deleted from the DAG, since they too may now be dead or may have a // reduced number of uses, allowing other xforms. if (recursivelyDeleteUnusedNodes(N)) continue; WorklistRemover DeadNodes(*this); // If this combine is running after legalizing the DAG, re-legalize any // nodes pulled off the worklist. if (Level == AfterLegalizeDAG) { SmallSetVector UpdatedNodes; bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes); for (SDNode *LN : UpdatedNodes) { AddToWorklist(LN); AddUsersToWorklist(LN); } if (!NIsValid) continue; } DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG)); // Add any operands of the new node which have not yet been combined to the // worklist as well. Because the worklist uniques things already, this // won't repeatedly process the same operand. CombinedNodes.insert(N); for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) if (!CombinedNodes.count(N->getOperand(i).getNode())) AddToWorklist(N->getOperand(i).getNode()); SDValue RV = combine(N); if (!RV.getNode()) continue; ++NodesCombined; // If we get back the same node we passed in, rather than a new node or // zero, we know that the node must have defined multiple values and // CombineTo was used. Since CombineTo takes care of the worklist // mechanics for us, we have no work to do in this case. if (RV.getNode() == N) continue; assert(N->getOpcode() != ISD::DELETED_NODE && RV.getNode()->getOpcode() != ISD::DELETED_NODE && "Node was deleted but visit returned new node!"); DEBUG(dbgs() << " ... into: "; RV.getNode()->dump(&DAG)); // Transfer debug value. DAG.TransferDbgValues(SDValue(N, 0), RV); if (N->getNumValues() == RV.getNode()->getNumValues()) DAG.ReplaceAllUsesWith(N, RV.getNode()); else { assert(N->getValueType(0) == RV.getValueType() && N->getNumValues() == 1 && "Type mismatch"); SDValue OpV = RV; DAG.ReplaceAllUsesWith(N, &OpV); } // Push the new node and any users onto the worklist AddToWorklist(RV.getNode()); AddUsersToWorklist(RV.getNode()); // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node. This will also take care of adding any // operands which have lost a user to the worklist. recursivelyDeleteUnusedNodes(N); } // If the root changed (e.g. it was a dead load, update the root). DAG.setRoot(Dummy.getValue()); DAG.RemoveDeadNodes(); } SDValue DAGCombiner::visit(SDNode *N) { switch (N->getOpcode()) { default: break; case ISD::TokenFactor: return visitTokenFactor(N); case ISD::MERGE_VALUES: return visitMERGE_VALUES(N); case ISD::ADD: return visitADD(N); case ISD::SUB: return visitSUB(N); case ISD::ADDC: return visitADDC(N); case ISD::SUBC: return visitSUBC(N); case ISD::ADDE: return visitADDE(N); case ISD::SUBE: return visitSUBE(N); case ISD::MUL: return visitMUL(N); case ISD::SDIV: return visitSDIV(N); case ISD::UDIV: return visitUDIV(N); case ISD::SREM: return visitSREM(N); case ISD::UREM: return visitUREM(N); case ISD::MULHU: return visitMULHU(N); case ISD::MULHS: return visitMULHS(N); case ISD::SMUL_LOHI: return visitSMUL_LOHI(N); case ISD::UMUL_LOHI: return visitUMUL_LOHI(N); case ISD::SMULO: return visitSMULO(N); case ISD::UMULO: return visitUMULO(N); case ISD::SDIVREM: return visitSDIVREM(N); case ISD::UDIVREM: return visitUDIVREM(N); case ISD::AND: return visitAND(N); case ISD::OR: return visitOR(N); case ISD::XOR: return visitXOR(N); case ISD::SHL: return visitSHL(N); case ISD::SRA: return visitSRA(N); case ISD::SRL: return visitSRL(N); case ISD::ROTR: case ISD::ROTL: return visitRotate(N); case ISD::CTLZ: return visitCTLZ(N); case ISD::CTLZ_ZERO_UNDEF: return visitCTLZ_ZERO_UNDEF(N); case ISD::CTTZ: return visitCTTZ(N); case ISD::CTTZ_ZERO_UNDEF: return visitCTTZ_ZERO_UNDEF(N); case ISD::CTPOP: return visitCTPOP(N); case ISD::SELECT: return visitSELECT(N); case ISD::VSELECT: return visitVSELECT(N); case ISD::SELECT_CC: return visitSELECT_CC(N); case ISD::SETCC: return visitSETCC(N); case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N); case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N); case ISD::ANY_EXTEND: return visitANY_EXTEND(N); case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N); case ISD::TRUNCATE: return visitTRUNCATE(N); case ISD::BITCAST: return visitBITCAST(N); case ISD::BUILD_PAIR: return visitBUILD_PAIR(N); case ISD::FADD: return visitFADD(N); case ISD::FSUB: return visitFSUB(N); case ISD::FMUL: return visitFMUL(N); case ISD::FMA: return visitFMA(N); case ISD::FDIV: return visitFDIV(N); case ISD::FREM: return visitFREM(N); case ISD::FSQRT: return visitFSQRT(N); case ISD::FCOPYSIGN: return visitFCOPYSIGN(N); case ISD::SINT_TO_FP: return visitSINT_TO_FP(N); case ISD::UINT_TO_FP: return visitUINT_TO_FP(N); case ISD::FP_TO_SINT: return visitFP_TO_SINT(N); case ISD::FP_TO_UINT: return visitFP_TO_UINT(N); case ISD::FP_ROUND: return visitFP_ROUND(N); case ISD::FP_ROUND_INREG: return visitFP_ROUND_INREG(N); case ISD::FP_EXTEND: return visitFP_EXTEND(N); case ISD::FNEG: return visitFNEG(N); case ISD::FABS: return visitFABS(N); case ISD::FFLOOR: return visitFFLOOR(N); case ISD::FMINNUM: return visitFMINNUM(N); case ISD::FMAXNUM: return visitFMAXNUM(N); case ISD::FCEIL: return visitFCEIL(N); case ISD::FTRUNC: return visitFTRUNC(N); case ISD::BRCOND: return visitBRCOND(N); case ISD::BR_CC: return visitBR_CC(N); case ISD::LOAD: return visitLOAD(N); case ISD::STORE: return visitSTORE(N); case ISD::INSERT_VECTOR_ELT: return visitINSERT_VECTOR_ELT(N); case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N); case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N); case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N); case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N); case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N); case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N); case ISD::MLOAD: return visitMLOAD(N); case ISD::MSTORE: return visitMSTORE(N); } return SDValue(); } SDValue DAGCombiner::combine(SDNode *N) { SDValue RV = visit(N); // If nothing happened, try a target-specific DAG combine. if (!RV.getNode()) { assert(N->getOpcode() != ISD::DELETED_NODE && "Node was deleted but visit returned NULL!"); if (N->getOpcode() >= ISD::BUILTIN_OP_END || TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) { // Expose the DAG combiner to the target combiner impls. TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, Level, false, this); RV = TLI.PerformDAGCombine(N, DagCombineInfo); } } // If nothing happened still, try promoting the operation. if (!RV.getNode()) { switch (N->getOpcode()) { default: break; case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: RV = PromoteIntBinOp(SDValue(N, 0)); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: RV = PromoteIntShiftOp(SDValue(N, 0)); break; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: RV = PromoteExtend(SDValue(N, 0)); break; case ISD::LOAD: if (PromoteLoad(SDValue(N, 0))) RV = SDValue(N, 0); break; } } // If N is a commutative binary node, try commuting it to enable more // sdisel CSE. if (!RV.getNode() && SelectionDAG::isCommutativeBinOp(N->getOpcode()) && N->getNumValues() == 1) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Constant operands are canonicalized to RHS. if (isa(N0) || !isa(N1)) { SDValue Ops[] = {N1, N0}; SDNode *CSENode; if (const BinaryWithFlagsSDNode *BinNode = dyn_cast(N)) { CSENode = DAG.getNodeIfExists( N->getOpcode(), N->getVTList(), Ops, BinNode->hasNoUnsignedWrap(), BinNode->hasNoSignedWrap(), BinNode->isExact()); } else { CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops); } if (CSENode) return SDValue(CSENode, 0); } } return RV; } /// Given a node, return its input chain if it has one, otherwise return a null /// sd operand. static SDValue getInputChainForNode(SDNode *N) { if (unsigned NumOps = N->getNumOperands()) { if (N->getOperand(0).getValueType() == MVT::Other) return N->getOperand(0); if (N->getOperand(NumOps-1).getValueType() == MVT::Other) return N->getOperand(NumOps-1); for (unsigned i = 1; i < NumOps-1; ++i) if (N->getOperand(i).getValueType() == MVT::Other) return N->getOperand(i); } return SDValue(); } SDValue DAGCombiner::visitTokenFactor(SDNode *N) { // If N has two operands, where one has an input chain equal to the other, // the 'other' chain is redundant. if (N->getNumOperands() == 2) { if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1)) return N->getOperand(0); if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0)) return N->getOperand(1); } SmallVector TFs; // List of token factors to visit. SmallVector Ops; // Ops for replacing token factor. SmallPtrSet SeenOps; bool Changed = false; // If we should replace this token factor. // Start out with this token factor. TFs.push_back(N); // Iterate through token factors. The TFs grows when new token factors are // encountered. for (unsigned i = 0; i < TFs.size(); ++i) { SDNode *TF = TFs[i]; // Check each of the operands. for (unsigned i = 0, ie = TF->getNumOperands(); i != ie; ++i) { SDValue Op = TF->getOperand(i); switch (Op.getOpcode()) { case ISD::EntryToken: // Entry tokens don't need to be added to the list. They are // rededundant. Changed = true; break; case ISD::TokenFactor: if (Op.hasOneUse() && std::find(TFs.begin(), TFs.end(), Op.getNode()) == TFs.end()) { // Queue up for processing. TFs.push_back(Op.getNode()); // Clean up in case the token factor is removed. AddToWorklist(Op.getNode()); Changed = true; break; } // Fall thru default: // Only add if it isn't already in the list. if (SeenOps.insert(Op.getNode()).second) Ops.push_back(Op); else Changed = true; break; } } } SDValue Result; // If we've change things around then replace token factor. if (Changed) { if (Ops.empty()) { // The entry token is the only possible outcome. Result = DAG.getEntryNode(); } else { // New and improved token factor. Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops); } // Don't add users to work list. return CombineTo(N, Result, false); } return Result; } /// MERGE_VALUES can always be eliminated. SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) { WorklistRemover DeadNodes(*this); // Replacing results may cause a different MERGE_VALUES to suddenly // be CSE'd with N, and carry its uses with it. Iterate until no // uses remain, to ensure that the node can be safely deleted. // First add the users of this node to the work list so that they // can be tried again once they have new operands. AddUsersToWorklist(N); do { for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i)); } while (!N->use_empty()); deleteAndRecombine(N); return SDValue(N, 0); // Return N so it doesn't get rechecked! } SDValue DAGCombiner::visitADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N0.getValueType(); // fold vector ops if (VT.isVector()) { SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; // fold (add x, 0) -> x, vector edition if (ISD::isBuildVectorAllZeros(N1.getNode())) return N0; if (ISD::isBuildVectorAllZeros(N0.getNode())) return N1; } // fold (add x, undef) -> undef if (N0.getOpcode() == ISD::UNDEF) return N0; if (N1.getOpcode() == ISD::UNDEF) return N1; // fold (add c1, c2) -> c1+c2 if (N0C && N1C) return DAG.FoldConstantArithmetic(ISD::ADD, VT, N0C, N1C); // canonicalize constant to RHS if (N0C && !N1C) return DAG.getNode(ISD::ADD, SDLoc(N), VT, N1, N0); // fold (add x, 0) -> x if (N1C && N1C->isNullValue()) return N0; // fold (add Sym, c) -> Sym+c if (GlobalAddressSDNode *GA = dyn_cast(N0)) if (!LegalOperations && TLI.isOffsetFoldingLegal(GA) && N1C && GA->getOpcode() == ISD::GlobalAddress) return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT, GA->getOffset() + (uint64_t)N1C->getSExtValue()); // fold ((c1-A)+c2) -> (c1+c2)-A if (N1C && N0.getOpcode() == ISD::SUB) if (ConstantSDNode *N0C = dyn_cast(N0.getOperand(0))) return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(N1C->getAPIntValue()+ N0C->getAPIntValue(), VT), N0.getOperand(1)); // reassociate add SDValue RADD = ReassociateOps(ISD::ADD, SDLoc(N), N0, N1); if (RADD.getNode()) return RADD; // fold ((0-A) + B) -> B-A if (N0.getOpcode() == ISD::SUB && isa(N0.getOperand(0)) && cast(N0.getOperand(0))->isNullValue()) return DAG.getNode(ISD::SUB, SDLoc(N), VT, N1, N0.getOperand(1)); // fold (A + (0-B)) -> A-B if (N1.getOpcode() == ISD::SUB && isa(N1.getOperand(0)) && cast(N1.getOperand(0))->isNullValue()) return DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, N1.getOperand(1)); // fold (A+(B-A)) -> B if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1)) return N1.getOperand(0); // fold ((B-A)+A) -> B if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1)) return N0.getOperand(0); // fold (A+(B-(A+C))) to (B-C) if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD && N0 == N1.getOperand(1).getOperand(0)) return DAG.getNode(ISD::SUB, SDLoc(N), VT, N1.getOperand(0), N1.getOperand(1).getOperand(1)); // fold (A+(B-(C+A))) to (B-C) if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD && N0 == N1.getOperand(1).getOperand(1)) return DAG.getNode(ISD::SUB, SDLoc(N), VT, N1.getOperand(0), N1.getOperand(1).getOperand(0)); // fold (A+((B-A)+or-C)) to (B+or-C) if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) && N1.getOperand(0).getOpcode() == ISD::SUB && N0 == N1.getOperand(0).getOperand(1)) return DAG.getNode(N1.getOpcode(), SDLoc(N), VT, N1.getOperand(0).getOperand(0), N1.getOperand(1)); // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) { SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); SDValue N10 = N1.getOperand(0); SDValue N11 = N1.getOperand(1); if (isa(N00) || isa(N10)) return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10), DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11)); } if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // fold (a+b) -> (a|b) iff a and b share no bits. if (VT.isInteger() && !VT.isVector()) { APInt LHSZero, LHSOne; APInt RHSZero, RHSOne; DAG.computeKnownBits(N0, LHSZero, LHSOne); if (LHSZero.getBoolValue()) { DAG.computeKnownBits(N1, RHSZero, RHSOne); // If all possibly-set bits on the LHS are clear on the RHS, return an OR. // If all possibly-set bits on the RHS are clear on the LHS, return an OR. if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero){ if (!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1); } } } // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n)) if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast(N1.getOperand(0).getOperand(0))) if (C->getAPIntValue() == 0) return DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, DAG.getNode(ISD::SHL, SDLoc(N), VT, N1.getOperand(0).getOperand(1), N1.getOperand(1))); if (N0.getOpcode() == ISD::SHL && N0.getOperand(0).getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast(N0.getOperand(0).getOperand(0))) if (C->getAPIntValue() == 0) return DAG.getNode(ISD::SUB, SDLoc(N), VT, N1, DAG.getNode(ISD::SHL, SDLoc(N), VT, N0.getOperand(0).getOperand(1), N0.getOperand(1))); if (N1.getOpcode() == ISD::AND) { SDValue AndOp0 = N1.getOperand(0); ConstantSDNode *AndOp1 = dyn_cast(N1->getOperand(1)); unsigned NumSignBits = DAG.ComputeNumSignBits(AndOp0); unsigned DestBits = VT.getScalarType().getSizeInBits(); // (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x)) // and similar xforms where the inner op is either ~0 or 0. if (NumSignBits == DestBits && AndOp1 && AndOp1->isOne()) { SDLoc DL(N); return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), AndOp0); } } // add (sext i1), X -> sub X, (zext i1) if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.getOperand(0).getValueType() == MVT::i1 && !TLI.isOperationLegal(ISD::SIGN_EXTEND, MVT::i1)) { SDLoc DL(N); SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt); } // add X, (sextinreg Y i1) -> sub X, (and Y 1) if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) { VTSDNode *TN = cast(N1.getOperand(1)); if (TN->getVT() == MVT::i1) { SDLoc DL(N); SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0), DAG.getConstant(1, VT)); return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt); } } return SDValue(); } SDValue DAGCombiner::visitADDC(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N0.getValueType(); // If the flag result is dead, turn this into an ADD. if (!N->hasAnyUseOfValue(1)) return CombineTo(N, DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N1), DAG.getNode(ISD::CARRY_FALSE, SDLoc(N), MVT::Glue)); // canonicalize constant to RHS. if (N0C && !N1C) return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N1, N0); // fold (addc x, 0) -> x + no carry out if (N1C && N1C->isNullValue()) return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, SDLoc(N), MVT::Glue)); // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits. APInt LHSZero, LHSOne; APInt RHSZero, RHSOne; DAG.computeKnownBits(N0, LHSZero, LHSOne); if (LHSZero.getBoolValue()) { DAG.computeKnownBits(N1, RHSZero, RHSOne); // If all possibly-set bits on the LHS are clear on the RHS, return an OR. // If all possibly-set bits on the RHS are clear on the LHS, return an OR. if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero) return CombineTo(N, DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1), DAG.getNode(ISD::CARRY_FALSE, SDLoc(N), MVT::Glue)); } return SDValue(); } SDValue DAGCombiner::visitADDE(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CarryIn = N->getOperand(2); ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); // canonicalize constant to RHS if (N0C && !N1C) return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(), N1, N0, CarryIn); // fold (adde x, y, false) -> (addc x, y) if (CarryIn.getOpcode() == ISD::CARRY_FALSE) return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N0, N1); return SDValue(); } // Since it may not be valid to emit a fold to zero for vector initializers // check if we can before folding. static SDValue tryFoldToZero(SDLoc DL, const TargetLowering &TLI, EVT VT, SelectionDAG &DAG, bool LegalOperations, bool LegalTypes) { if (!VT.isVector()) return DAG.getConstant(0, VT); if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) return DAG.getConstant(0, VT); return SDValue(); } SDValue DAGCombiner::visitSUB(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N0C = dyn_cast(N0.getNode()); ConstantSDNode *N1C = dyn_cast(N1.getNode()); ConstantSDNode *N1C1 = N1.getOpcode() != ISD::ADD ? nullptr : dyn_cast(N1.getOperand(1).getNode()); EVT VT = N0.getValueType(); // fold vector ops if (VT.isVector()) { SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; // fold (sub x, 0) -> x, vector edition if (ISD::isBuildVectorAllZeros(N1.getNode())) return N0; } // fold (sub x, x) -> 0 // FIXME: Refactor this and xor and other similar operations together. if (N0 == N1) return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes); // fold (sub c1, c2) -> c1-c2 if (N0C && N1C) return DAG.FoldConstantArithmetic(ISD::SUB, VT, N0C, N1C); // fold (sub x, c) -> (add x, -c) if (N1C) return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, DAG.getConstant(-N1C->getAPIntValue(), VT)); // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) if (N0C && N0C->isAllOnesValue()) return DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0); // fold A-(A-B) -> B if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0)) return N1.getOperand(1); // fold (A+B)-A -> B if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1) return N0.getOperand(1); // fold (A+B)-B -> A if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1) return N0.getOperand(0); // fold C2-(A+C1) -> (C2-C1)-A if (N1.getOpcode() == ISD::ADD && N0C && N1C1) { SDValue NewC = DAG.getConstant(N0C->getAPIntValue() - N1C1->getAPIntValue(), VT); return DAG.getNode(ISD::SUB, SDLoc(N), VT, NewC, N1.getOperand(0)); } // fold ((A+(B+or-C))-B) -> A+or-C if (N0.getOpcode() == ISD::ADD && (N0.getOperand(1).getOpcode() == ISD::SUB || N0.getOperand(1).getOpcode() == ISD::ADD) && N0.getOperand(1).getOperand(0) == N1) return DAG.getNode(N0.getOperand(1).getOpcode(), SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1).getOperand(1)); // fold ((A+(C+B))-B) -> A+C if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD && N0.getOperand(1).getOperand(1) == N1) return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1).getOperand(0)); // fold ((A-(B-C))-C) -> A-B if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB && N0.getOperand(1).getOperand(1) == N1) return DAG.getNode(ISD::SUB, SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1).getOperand(0)); // If either operand of a sub is undef, the result is undef if (N0.getOpcode() == ISD::UNDEF) return N0; if (N1.getOpcode() == ISD::UNDEF) return N1; // If the relocation model supports it, consider symbol offsets. if (GlobalAddressSDNode *GA = dyn_cast(N0)) if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) { // fold (sub Sym, c) -> Sym-c if (N1C && GA->getOpcode() == ISD::GlobalAddress) return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT, GA->getOffset() - (uint64_t)N1C->getSExtValue()); // fold (sub Sym+c1, Sym+c2) -> c1-c2 if (GlobalAddressSDNode *GB = dyn_cast(N1)) if (GA->getGlobal() == GB->getGlobal()) return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(), VT); } // sub X, (sextinreg Y i1) -> add X, (and Y 1) if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) { VTSDNode *TN = cast(N1.getOperand(1)); if (TN->getVT() == MVT::i1) { SDLoc DL(N); SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0), DAG.getConstant(1, VT)); return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt); } } return SDValue(); } SDValue DAGCombiner::visitSUBC(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N0.getValueType(); // If the flag result is dead, turn this into an SUB. if (!N->hasAnyUseOfValue(1)) return CombineTo(N, DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, N1), DAG.getNode(ISD::CARRY_FALSE, SDLoc(N), MVT::Glue)); // fold (subc x, x) -> 0 + no borrow if (N0 == N1) return CombineTo(N, DAG.getConstant(0, VT), DAG.getNode(ISD::CARRY_FALSE, SDLoc(N), MVT::Glue)); // fold (subc x, 0) -> x + no borrow if (N1C && N1C->isNullValue()) return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, SDLoc(N), MVT::Glue)); // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) + no borrow if (N0C && N0C->isAllOnesValue()) return CombineTo(N, DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0), DAG.getNode(ISD::CARRY_FALSE, SDLoc(N), MVT::Glue)); return SDValue(); } SDValue DAGCombiner::visitSUBE(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CarryIn = N->getOperand(2); // fold (sube x, y, false) -> (subc x, y) if (CarryIn.getOpcode() == ISD::CARRY_FALSE) return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), N0, N1); return SDValue(); } SDValue DAGCombiner::visitMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); // fold (mul x, undef) -> 0 if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF) return DAG.getConstant(0, VT); bool N0IsConst = false; bool N1IsConst = false; APInt ConstValue0, ConstValue1; // fold vector ops if (VT.isVector()) { SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; N0IsConst = isConstantSplatVector(N0.getNode(), ConstValue0); N1IsConst = isConstantSplatVector(N1.getNode(), ConstValue1); } else { N0IsConst = dyn_cast(N0) != nullptr; ConstValue0 = N0IsConst ? (dyn_cast(N0))->getAPIntValue() : APInt(); N1IsConst = dyn_cast(N1) != nullptr; ConstValue1 = N1IsConst ? (dyn_cast(N1))->getAPIntValue() : APInt(); } // fold (mul c1, c2) -> c1*c2 if (N0IsConst && N1IsConst) return DAG.FoldConstantArithmetic(ISD::MUL, VT, N0.getNode(), N1.getNode()); // canonicalize constant to RHS if (N0IsConst && !N1IsConst) return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0); // fold (mul x, 0) -> 0 if (N1IsConst && ConstValue1 == 0) return N1; // We require a splat of the entire scalar bit width for non-contiguous // bit patterns. bool IsFullSplat = ConstValue1.getBitWidth() == VT.getScalarType().getSizeInBits(); // fold (mul x, 1) -> x if (N1IsConst && ConstValue1 == 1 && IsFullSplat) return N0; // fold (mul x, -1) -> 0-x if (N1IsConst && ConstValue1.isAllOnesValue()) return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), N0); // fold (mul x, (1 << c)) -> x << c if (N1IsConst && ConstValue1.isPowerOf2() && IsFullSplat) return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, DAG.getConstant(ConstValue1.logBase2(), getShiftAmountTy(N0.getValueType()))); // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c if (N1IsConst && (-ConstValue1).isPowerOf2() && IsFullSplat) { unsigned Log2Val = (-ConstValue1).logBase2(); // FIXME: If the input is something that is easily negated (e.g. a // single-use add), we should put the negate there. return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, DAG.getConstant(Log2Val, getShiftAmountTy(N0.getValueType())))); } APInt Val; // (mul (shl X, c1), c2) -> (mul X, c2 << c1) if (N1IsConst && N0.getOpcode() == ISD::SHL && (isConstantSplatVector(N0.getOperand(1).getNode(), Val) || isa(N0.getOperand(1)))) { SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1)); AddToWorklist(C3.getNode()); return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3); } // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one // use. { SDValue Sh(nullptr,0), Y(nullptr,0); // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)). if (N0.getOpcode() == ISD::SHL && (isConstantSplatVector(N0.getOperand(1).getNode(), Val) || isa(N0.getOperand(1))) && N0.getNode()->hasOneUse()) { Sh = N0; Y = N1; } else if (N1.getOpcode() == ISD::SHL && isa(N1.getOperand(1)) && N1.getNode()->hasOneUse()) { Sh = N1; Y = N0; } if (Sh.getNode()) { SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, Sh.getOperand(0), Y); return DAG.getNode(ISD::SHL, SDLoc(N), VT, Mul, Sh.getOperand(1)); } } // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2) if (N1IsConst && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() && (isConstantSplatVector(N0.getOperand(1).getNode(), Val) || isa(N0.getOperand(1)))) return DAG.getNode(ISD::ADD, SDLoc(N), VT, DAG.getNode(ISD::MUL, SDLoc(N0), VT, N0.getOperand(0), N1), DAG.getNode(ISD::MUL, SDLoc(N1), VT, N0.getOperand(1), N1)); // reassociate mul SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1); if (RMUL.getNode()) return RMUL; return SDValue(); } SDValue DAGCombiner::visitSDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); EVT VT = N->getValueType(0); // fold vector ops if (VT.isVector()) { SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; } // fold (sdiv c1, c2) -> c1/c2 if (N0C && N1C && !N1C->isNullValue()) return DAG.FoldConstantArithmetic(ISD::SDIV, VT, N0C, N1C); // fold (sdiv X, 1) -> X if (N1C && N1C->getAPIntValue() == 1LL) return N0; // fold (sdiv X, -1) -> 0-X if (N1C && N1C->isAllOnesValue()) return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), N0); // If we know the sign bits of both operands are zero, strength reduce to a // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2 if (!VT.isVector()) { if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::UDIV, SDLoc(N), N1.getValueType(), N0, N1); } // fold (sdiv X, pow2) -> simple ops after legalize if (N1C && !N1C->isNullValue() && (N1C->getAPIntValue().isPowerOf2() || (-N1C->getAPIntValue()).isPowerOf2())) { // If dividing by powers of two is cheap, then don't perform the following // fold. if (TLI.isPow2SDivCheap()) return SDValue(); // Target-specific implementation of sdiv x, pow2. SDValue Res = BuildSDIVPow2(N); if (Res.getNode()) return Res; unsigned lg2 = N1C->getAPIntValue().countTrailingZeros(); // Splat the sign bit into the register SDValue SGN = DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, DAG.getConstant(VT.getScalarSizeInBits() - 1, getShiftAmountTy(N0.getValueType()))); AddToWorklist(SGN.getNode()); // Add (N0 < 0) ? abs2 - 1 : 0; SDValue SRL = DAG.getNode(ISD::SRL, SDLoc(N), VT, SGN, DAG.getConstant(VT.getScalarSizeInBits() - lg2, getShiftAmountTy(SGN.getValueType()))); SDValue ADD = DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, SRL); AddToWorklist(SRL.getNode()); AddToWorklist(ADD.getNode()); // Divide by pow2 SDValue SRA = DAG.getNode(ISD::SRA, SDLoc(N), VT, ADD, DAG.getConstant(lg2, getShiftAmountTy(ADD.getValueType()))); // If we're dividing by a positive value, we're done. Otherwise, we must // negate the result. if (N1C->getAPIntValue().isNonNegative()) return SRA; AddToWorklist(SRA.getNode()); return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), SRA); } // if integer divide is expensive and we satisfy the requirements, emit an // alternate sequence. if (N1C && !TLI.isIntDivCheap()) { SDValue Op = BuildSDIV(N); if (Op.getNode()) return Op; } // undef / X -> 0 if (N0.getOpcode() == ISD::UNDEF) return DAG.getConstant(0, VT); // X / undef -> undef if (N1.getOpcode() == ISD::UNDEF) return N1; return SDValue(); } SDValue DAGCombiner::visitUDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); EVT VT = N->getValueType(0); // fold vector ops if (VT.isVector()) { SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; } // fold (udiv c1, c2) -> c1/c2 if (N0C && N1C && !N1C->isNullValue()) return DAG.FoldConstantArithmetic(ISD::UDIV, VT, N0C, N1C); // fold (udiv x, (1 << c)) -> x >>u c if (N1C && N1C->getAPIntValue().isPowerOf2()) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, DAG.getConstant(N1C->getAPIntValue().logBase2(), getShiftAmountTy(N0.getValueType()))); // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 if (N1.getOpcode() == ISD::SHL) { if (ConstantSDNode *SHC = dyn_cast(N1.getOperand(0))) { if (SHC->getAPIntValue().isPowerOf2()) { EVT ADDVT = N1.getOperand(1).getValueType(); SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N), ADDVT, N1.getOperand(1), DAG.getConstant(SHC->getAPIntValue() .logBase2(), ADDVT)); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, Add); } } } // fold (udiv x, c) -> alternate if (N1C && !TLI.isIntDivCheap()) { SDValue Op = BuildUDIV(N); if (Op.getNode()) return Op; } // undef / X -> 0 if (N0.getOpcode() == ISD::UNDEF) return DAG.getConstant(0, VT); // X / undef -> undef if (N1.getOpcode() == ISD::UNDEF) return N1; return SDValue(); } SDValue DAGCombiner::visitSREM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); EVT VT = N->getValueType(0); // fold (srem c1, c2) -> c1%c2 if (N0C && N1C && !N1C->isNullValue()) return DAG.FoldConstantArithmetic(ISD::SREM, VT, N0C, N1C); // If we know the sign bits of both operands are zero, strength reduce to a // urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15 if (!VT.isVector()) { if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::UREM, SDLoc(N), VT, N0, N1); } // If X/C can be simplified by the division-by-constant logic, lower // X%C to the equivalent of X-X/C*C. if (N1C && !N1C->isNullValue()) { SDValue Div = DAG.getNode(ISD::SDIV, SDLoc(N), VT, N0, N1); AddToWorklist(Div.getNode()); SDValue OptimizedDiv = combine(Div.getNode()); if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) { SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, OptimizedDiv, N1); SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, Mul); AddToWorklist(Mul.getNode()); return Sub; } } // undef % X -> 0 if (N0.getOpcode() == ISD::UNDEF) return DAG.getConstant(0, VT); // X % undef -> undef if (N1.getOpcode() == ISD::UNDEF) return N1; return SDValue(); } SDValue DAGCombiner::visitUREM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); EVT VT = N->getValueType(0); // fold (urem c1, c2) -> c1%c2 if (N0C && N1C && !N1C->isNullValue()) return DAG.FoldConstantArithmetic(ISD::UREM, VT, N0C, N1C); // fold (urem x, pow2) -> (and x, pow2-1) if (N1C && !N1C->isNullValue() && N1C->getAPIntValue().isPowerOf2()) return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, DAG.getConstant(N1C->getAPIntValue()-1,VT)); // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) if (N1.getOpcode() == ISD::SHL) { if (ConstantSDNode *SHC = dyn_cast(N1.getOperand(0))) { if (SHC->getAPIntValue().isPowerOf2()) { SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N), VT, N1, DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT)); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, Add); } } } // If X/C can be simplified by the division-by-constant logic, lower // X%C to the equivalent of X-X/C*C. if (N1C && !N1C->isNullValue()) { SDValue Div = DAG.getNode(ISD::UDIV, SDLoc(N), VT, N0, N1); AddToWorklist(Div.getNode()); SDValue OptimizedDiv = combine(Div.getNode()); if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) { SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, OptimizedDiv, N1); SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, Mul); AddToWorklist(Mul.getNode()); return Sub; } } // undef % X -> 0 if (N0.getOpcode() == ISD::UNDEF) return DAG.getConstant(0, VT); // X % undef -> undef if (N1.getOpcode() == ISD::UNDEF) return N1; return SDValue(); } SDValue DAGCombiner::visitMULHS(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N->getValueType(0); SDLoc DL(N); // fold (mulhs x, 0) -> 0 if (N1C && N1C->isNullValue()) return N1; // fold (mulhs x, 1) -> (sra x, size(x)-1) if (N1C && N1C->getAPIntValue() == 1) return DAG.getNode(ISD::SRA, SDLoc(N), N0.getValueType(), N0, DAG.getConstant(N0.getValueType().getSizeInBits() - 1, getShiftAmountTy(N0.getValueType()))); // fold (mulhs x, undef) -> 0 if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF) return DAG.getConstant(0, VT); // If the type twice as wide is legal, transform the mulhs to a wider multiply // plus a shift. if (VT.isSimple() && !VT.isVector()) { MVT Simple = VT.getSimpleVT(); unsigned SimpleSize = Simple.getSizeInBits(); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); if (TLI.isOperationLegal(ISD::MUL, NewVT)) { N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0); N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1); N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1); N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1, DAG.getConstant(SimpleSize, getShiftAmountTy(N1.getValueType()))); return DAG.getNode(ISD::TRUNCATE, DL, VT, N1); } } return SDValue(); } SDValue DAGCombiner::visitMULHU(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N->getValueType(0); SDLoc DL(N); // fold (mulhu x, 0) -> 0 if (N1C && N1C->isNullValue()) return N1; // fold (mulhu x, 1) -> 0 if (N1C && N1C->getAPIntValue() == 1) return DAG.getConstant(0, N0.getValueType()); // fold (mulhu x, undef) -> 0 if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF) return DAG.getConstant(0, VT); // If the type twice as wide is legal, transform the mulhu to a wider multiply // plus a shift. if (VT.isSimple() && !VT.isVector()) { MVT Simple = VT.getSimpleVT(); unsigned SimpleSize = Simple.getSizeInBits(); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); if (TLI.isOperationLegal(ISD::MUL, NewVT)) { N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0); N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1); N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1); N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1, DAG.getConstant(SimpleSize, getShiftAmountTy(N1.getValueType()))); return DAG.getNode(ISD::TRUNCATE, DL, VT, N1); } } return SDValue(); } /// Perform optimizations common to nodes that compute two values. LoOp and HiOp /// give the opcodes for the two computations that are being performed. Return /// true if a simplification was made. SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, unsigned HiOp) { // If the high half is not needed, just compute the low half. bool HiExists = N->hasAnyUseOfValue(1); if (!HiExists && (!LegalOperations || TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) { SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops()); return CombineTo(N, Res, Res); } // If the low half is not needed, just compute the high half. bool LoExists = N->hasAnyUseOfValue(0); if (!LoExists && (!LegalOperations || TLI.isOperationLegal(HiOp, N->getValueType(1)))) { SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops()); return CombineTo(N, Res, Res); } // If both halves are used, return as it is. if (LoExists && HiExists) return SDValue(); // If the two computed results can be simplified separately, separate them. if (LoExists) { SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops()); AddToWorklist(Lo.getNode()); SDValue LoOpt = combine(Lo.getNode()); if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() && (!LegalOperations || TLI.isOperationLegal(LoOpt.getOpcode(), LoOpt.getValueType()))) return CombineTo(N, LoOpt, LoOpt); } if (HiExists) { SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops()); AddToWorklist(Hi.getNode()); SDValue HiOpt = combine(Hi.getNode()); if (HiOpt.getNode() && HiOpt != Hi && (!LegalOperations || TLI.isOperationLegal(HiOpt.getOpcode(), HiOpt.getValueType()))) return CombineTo(N, HiOpt, HiOpt); } return SDValue(); } SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) { SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS); if (Res.getNode()) return Res; EVT VT = N->getValueType(0); SDLoc DL(N); // If the type twice as wide is legal, transform the mulhu to a wider multiply // plus a shift. if (VT.isSimple() && !VT.isVector()) { MVT Simple = VT.getSimpleVT(); unsigned SimpleSize = Simple.getSizeInBits(); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); if (TLI.isOperationLegal(ISD::MUL, NewVT)) { SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(0)); SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(1)); Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi); // Compute the high part as N1. Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo, DAG.getConstant(SimpleSize, getShiftAmountTy(Lo.getValueType()))); Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi); // Compute the low part as N0. Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo); return CombineTo(N, Lo, Hi); } } return SDValue(); } SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) { SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU); if (Res.getNode()) return Res; EVT VT = N->getValueType(0); SDLoc DL(N); // If the type twice as wide is legal, transform the mulhu to a wider multiply // plus a shift. if (VT.isSimple() && !VT.isVector()) { MVT Simple = VT.getSimpleVT(); unsigned SimpleSize = Simple.getSizeInBits(); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); if (TLI.isOperationLegal(ISD::MUL, NewVT)) { SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(0)); SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(1)); Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi); // Compute the high part as N1. Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo, DAG.getConstant(SimpleSize, getShiftAmountTy(Lo.getValueType()))); Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi); // Compute the low part as N0. Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo); return CombineTo(N, Lo, Hi); } } return SDValue(); } SDValue DAGCombiner::visitSMULO(SDNode *N) { // (smulo x, 2) -> (saddo x, x) if (ConstantSDNode *C2 = dyn_cast(N->getOperand(1))) if (C2->getAPIntValue() == 2) return DAG.getNode(ISD::SADDO, SDLoc(N), N->getVTList(), N->getOperand(0), N->getOperand(0)); return SDValue(); } SDValue DAGCombiner::visitUMULO(SDNode *N) { // (umulo x, 2) -> (uaddo x, x) if (ConstantSDNode *C2 = dyn_cast(N->getOperand(1))) if (C2->getAPIntValue() == 2) return DAG.getNode(ISD::UADDO, SDLoc(N), N->getVTList(), N->getOperand(0), N->getOperand(0)); return SDValue(); } SDValue DAGCombiner::visitSDIVREM(SDNode *N) { SDValue Res = SimplifyNodeWithTwoResults(N, ISD::SDIV, ISD::SREM); if (Res.getNode()) return Res; return SDValue(); } SDValue DAGCombiner::visitUDIVREM(SDNode *N) { SDValue Res = SimplifyNodeWithTwoResults(N, ISD::UDIV, ISD::UREM); if (Res.getNode()) return Res; return SDValue(); } /// If this is a binary operator with two operands of the same opcode, try to /// simplify it. SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); EVT VT = N0.getValueType(); assert(N0.getOpcode() == N1.getOpcode() && "Bad input!"); // Bail early if none of these transforms apply. if (N0.getNode()->getNumOperands() == 0) return SDValue(); // For each of OP in AND/OR/XOR: // fold (OP (zext x), (zext y)) -> (zext (OP x, y)) // fold (OP (sext x), (sext y)) -> (sext (OP x, y)) // fold (OP (aext x), (aext y)) -> (aext (OP x, y)) // fold (OP (bswap x), (bswap y)) -> (bswap (OP x, y)) // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free) // // do not sink logical op inside of a vector extend, since it may combine // into a vsetcc. EVT Op0VT = N0.getOperand(0).getValueType(); if ((N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::BSWAP || // Avoid infinite looping with PromoteIntBinOp. (N0.getOpcode() == ISD::ANY_EXTEND && (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) || (N0.getOpcode() == ISD::TRUNCATE && (!TLI.isZExtFree(VT, Op0VT) || !TLI.isTruncateFree(Op0VT, VT)) && TLI.isTypeLegal(Op0VT))) && !VT.isVector() && Op0VT == N1.getOperand(0).getValueType() && (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) { SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0), N0.getOperand(0).getValueType(), N0.getOperand(0), N1.getOperand(0)); AddToWorklist(ORNode.getNode()); return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode); } // For each of OP in SHL/SRL/SRA/AND... // fold (and (OP x, z), (OP y, z)) -> (OP (and x, y), z) // fold (or (OP x, z), (OP y, z)) -> (OP (or x, y), z) // fold (xor (OP x, z), (OP y, z)) -> (OP (xor x, y), z) if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::AND) && N0.getOperand(1) == N1.getOperand(1)) { SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0), N0.getOperand(0).getValueType(), N0.getOperand(0), N1.getOperand(0)); AddToWorklist(ORNode.getNode()); return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode, N0.getOperand(1)); } // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B)) // Only perform this optimization after type legalization and before // LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and // we don't want to undo this promotion. // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper // on scalars. if ((N0.getOpcode() == ISD::BITCAST || N0.getOpcode() == ISD::SCALAR_TO_VECTOR) && Level == AfterLegalizeTypes) { SDValue In0 = N0.getOperand(0); SDValue In1 = N1.getOperand(0); EVT In0Ty = In0.getValueType(); EVT In1Ty = In1.getValueType(); SDLoc DL(N); // If both incoming values are integers, and the original types are the // same. if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) { SDValue Op = DAG.getNode(N->getOpcode(), DL, In0Ty, In0, In1); SDValue BC = DAG.getNode(N0.getOpcode(), DL, VT, Op); AddToWorklist(Op.getNode()); return BC; } } // Xor/and/or are indifferent to the swizzle operation (shuffle of one value). // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B)) // If both shuffles use the same mask, and both shuffle within a single // vector, then it is worthwhile to move the swizzle after the operation. // The type-legalizer generates this pattern when loading illegal // vector types from memory. In many cases this allows additional shuffle // optimizations. // There are other cases where moving the shuffle after the xor/and/or // is profitable even if shuffles don't perform a swizzle. // If both shuffles use the same mask, and both shuffles have the same first // or second operand, then it might still be profitable to move the shuffle // after the xor/and/or operation. if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) { ShuffleVectorSDNode *SVN0 = cast(N0); ShuffleVectorSDNode *SVN1 = cast(N1); assert(N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() && "Inputs to shuffles are not the same type"); // Check that both shuffles use the same mask. The masks are known to be of // the same length because the result vector type is the same. // Check also that shuffles have only one use to avoid introducing extra // instructions. if (SVN0->hasOneUse() && SVN1->hasOneUse() && SVN0->getMask().equals(SVN1->getMask())) { SDValue ShOp = N0->getOperand(1); // Don't try to fold this node if it requires introducing a // build vector of all zeros that might be illegal at this stage. if (N->getOpcode() == ISD::XOR && ShOp.getOpcode() != ISD::UNDEF) { if (!LegalTypes) ShOp = DAG.getConstant(0, VT); else ShOp = SDValue(); } // (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C) // (OR (shuf (A, C), shuf (B, C)) -> shuf (OR (A, B), C) // (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0) if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) { SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0->getOperand(0), N1->getOperand(0)); AddToWorklist(NewNode.getNode()); return DAG.getVectorShuffle(VT, SDLoc(N), NewNode, ShOp, &SVN0->getMask()[0]); } // Don't try to fold this node if it requires introducing a // build vector of all zeros that might be illegal at this stage. ShOp = N0->getOperand(0); if (N->getOpcode() == ISD::XOR && ShOp.getOpcode() != ISD::UNDEF) { if (!LegalTypes) ShOp = DAG.getConstant(0, VT); else ShOp = SDValue(); } // (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B)) // (OR (shuf (C, A), shuf (C, B)) -> shuf (C, OR (A, B)) // (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B)) if (N0->getOperand(0) == N1->getOperand(0) && ShOp.getNode()) { SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0->getOperand(1), N1->getOperand(1)); AddToWorklist(NewNode.getNode()); return DAG.getVectorShuffle(VT, SDLoc(N), ShOp, NewNode, &SVN0->getMask()[0]); } } } return SDValue(); } SDValue DAGCombiner::visitAND(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue LL, LR, RL, RR, CC0, CC1; ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N1.getValueType(); unsigned BitWidth = VT.getScalarType().getSizeInBits(); // fold vector ops if (VT.isVector()) { SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; // fold (and x, 0) -> 0, vector edition if (ISD::isBuildVectorAllZeros(N0.getNode())) // do not return N0, because undef node may exist in N0 return DAG.getConstant( APInt::getNullValue( N0.getValueType().getScalarType().getSizeInBits()), N0.getValueType()); if (ISD::isBuildVectorAllZeros(N1.getNode())) // do not return N1, because undef node may exist in N1 return DAG.getConstant( APInt::getNullValue( N1.getValueType().getScalarType().getSizeInBits()), N1.getValueType()); // fold (and x, -1) -> x, vector edition if (ISD::isBuildVectorAllOnes(N0.getNode())) return N1; if (ISD::isBuildVectorAllOnes(N1.getNode())) return N0; } // fold (and x, undef) -> 0 if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF) return DAG.getConstant(0, VT); // fold (and c1, c2) -> c1&c2 if (N0C && N1C) return DAG.FoldConstantArithmetic(ISD::AND, VT, N0C, N1C); // canonicalize constant to RHS if (N0C && !N1C) return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0); // fold (and x, -1) -> x if (N1C && N1C->isAllOnesValue()) return N0; // if (and x, c) is known to be zero, return 0 if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(BitWidth))) return DAG.getConstant(0, VT); // reassociate and SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1); if (RAND.getNode()) return RAND; // fold (and (or x, C), D) -> D if (C & D) == D if (N1C && N0.getOpcode() == ISD::OR) if (ConstantSDNode *ORI = dyn_cast(N0.getOperand(1))) if ((ORI->getAPIntValue() & N1C->getAPIntValue()) == N1C->getAPIntValue()) return N1; // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits. if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) { SDValue N0Op0 = N0.getOperand(0); APInt Mask = ~N1C->getAPIntValue(); Mask = Mask.trunc(N0Op0.getValueSizeInBits()); if (DAG.MaskedValueIsZero(N0Op0, Mask)) { SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N0.getValueType(), N0Op0); // Replace uses of the AND with uses of the Zero extend node. CombineTo(N, Zext); // We actually want to replace all uses of the any_extend with the // zero_extend, to avoid duplicating things. This will later cause this // AND to be folded. CombineTo(N0.getNode(), Zext); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) -> // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must // already be zero by virtue of the width of the base type of the load. // // the 'X' node here can either be nothing or an extract_vector_elt to catch // more cases. if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && N0.getOperand(0).getOpcode() == ISD::LOAD) || N0.getOpcode() == ISD::LOAD) { LoadSDNode *Load = cast( (N0.getOpcode() == ISD::LOAD) ? N0 : N0.getOperand(0) ); // Get the constant (if applicable) the zero'th operand is being ANDed with. // This can be a pure constant or a vector splat, in which case we treat the // vector as a scalar and use the splat value. APInt Constant = APInt::getNullValue(1); if (const ConstantSDNode *C = dyn_cast(N1)) { Constant = C->getAPIntValue(); } else if (BuildVectorSDNode *Vector = dyn_cast(N1)) { APInt SplatValue, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs); if (IsSplat) { // Undef bits can contribute to a possible optimisation if set, so // set them. SplatValue |= SplatUndef; // The splat value may be something like "0x00FFFFFF", which means 0 for // the first vector value and FF for the rest, repeating. We need a mask // that will apply equally to all members of the vector, so AND all the // lanes of the constant together. EVT VT = Vector->getValueType(0); unsigned BitWidth = VT.getVectorElementType().getSizeInBits(); // If the splat value has been compressed to a bitlength lower // than the size of the vector lane, we need to re-expand it to // the lane size. if (BitWidth > SplatBitSize) for (SplatValue = SplatValue.zextOrTrunc(BitWidth); SplatBitSize < BitWidth; SplatBitSize = SplatBitSize * 2) SplatValue |= SplatValue.shl(SplatBitSize); Constant = APInt::getAllOnesValue(BitWidth); for (unsigned i = 0, n = SplatBitSize/BitWidth; i < n; ++i) Constant &= SplatValue.lshr(i*BitWidth).zextOrTrunc(BitWidth); } } // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is // actually legal and isn't going to get expanded, else this is a false // optimisation. bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD, Load->getValueType(0), Load->getMemoryVT()); // Resize the constant to the same size as the original memory access before // extension. If it is still the AllOnesValue then this AND is completely // unneeded. Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarType().getSizeInBits()); bool B; switch (Load->getExtensionType()) { default: B = false; break; case ISD::EXTLOAD: B = CanZextLoadProfitably; break; case ISD::ZEXTLOAD: case ISD::NON_EXTLOAD: B = true; break; } if (B && Constant.isAllOnesValue()) { // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to // preserve semantics once we get rid of the AND. SDValue NewLoad(Load, 0); if (Load->getExtensionType() == ISD::EXTLOAD) { NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD, Load->getValueType(0), SDLoc(Load), Load->getChain(), Load->getBasePtr(), Load->getOffset(), Load->getMemoryVT(), Load->getMemOperand()); // Replace uses of the EXTLOAD with the new ZEXTLOAD. if (Load->getNumValues() == 3) { // PRE/POST_INC loads have 3 values. SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1), NewLoad.getValue(2) }; CombineTo(Load, To, 3, true); } else { CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1)); } } // Fold the AND away, taking care not to fold to the old load node if we // replaced it. CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (and (setcc x), (setcc y)) -> (setcc (and x, y)) if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){ ISD::CondCode Op0 = cast(CC0)->get(); ISD::CondCode Op1 = cast(CC1)->get(); if (LR == RR && isa(LR) && Op0 == Op1 && LL.getValueType().isInteger()) { // fold (and (seteq X, 0), (seteq Y, 0)) -> (seteq (or X, Y), 0) if (cast(LR)->isNullValue() && Op1 == ISD::SETEQ) { SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0), LR.getValueType(), LL, RL); AddToWorklist(ORNode.getNode()); return DAG.getSetCC(SDLoc(N), VT, ORNode, LR, Op1); } // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1) if (cast(LR)->isAllOnesValue() && Op1 == ISD::SETEQ) { SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(N0), LR.getValueType(), LL, RL); AddToWorklist(ANDNode.getNode()); return DAG.getSetCC(SDLoc(N), VT, ANDNode, LR, Op1); } // fold (and (setgt X, -1), (setgt Y, -1)) -> (setgt (or X, Y), -1) if (cast(LR)->isAllOnesValue() && Op1 == ISD::SETGT) { SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0), LR.getValueType(), LL, RL); AddToWorklist(ORNode.getNode()); return DAG.getSetCC(SDLoc(N), VT, ORNode, LR, Op1); } } // Simplify (and (setne X, 0), (setne X, -1)) -> (setuge (add X, 1), 2) if (LL == RL && isa(LR) && isa(RR) && Op0 == Op1 && LL.getValueType().isInteger() && Op0 == ISD::SETNE && ((cast(LR)->isNullValue() && cast(RR)->isAllOnesValue()) || (cast(LR)->isAllOnesValue() && cast(RR)->isNullValue()))) { SDValue ADDNode = DAG.getNode(ISD::ADD, SDLoc(N0), LL.getValueType(), LL, DAG.getConstant(1, LL.getValueType())); AddToWorklist(ADDNode.getNode()); return DAG.getSetCC(SDLoc(N), VT, ADDNode, DAG.getConstant(2, LL.getValueType()), ISD::SETUGE); } // canonicalize equivalent to ll == rl if (LL == RR && LR == RL) { Op1 = ISD::getSetCCSwappedOperands(Op1); std::swap(RL, RR); } if (LL == RL && LR == RR) { bool isInteger = LL.getValueType().isInteger(); ISD::CondCode Result = ISD::getSetCCAndOperation(Op0, Op1, isInteger); if (Result != ISD::SETCC_INVALID && (!LegalOperations || (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && TLI.isOperationLegal(ISD::SETCC, getSetCCResultType(N0.getSimpleValueType()))))) return DAG.getSetCC(SDLoc(N), N0.getValueType(), LL, LR, Result); } } // Simplify: (and (op x...), (op y...)) -> (op (and x, y)) if (N0.getOpcode() == N1.getOpcode()) { SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N); if (Tmp.getNode()) return Tmp; } // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1) // fold (and (sra)) -> (and (srl)) when possible. if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // fold (zext_inreg (extload x)) -> (zextload x) if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode())) { LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); // If we zero all the possible extended bits, then we can turn this into // a zextload if we are running before legalize or the operation is legal. unsigned BitWidth = N1.getValueType().getScalarType().getSizeInBits(); if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth, BitWidth - MemVT.getScalarType().getSizeInBits())) && ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); AddToWorklist(N); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use if (ISD::isSEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) { LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); // If we zero all the possible extended bits, then we can turn this into // a zextload if we are running before legalize or the operation is legal. unsigned BitWidth = N1.getValueType().getScalarType().getSizeInBits(); if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth, BitWidth - MemVT.getScalarType().getSizeInBits())) && ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); AddToWorklist(N); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (and (load x), 255) -> (zextload x, i8) // fold (and (extload x, i16), 255) -> (zextload x, i8) // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8) if (N1C && (N0.getOpcode() == ISD::LOAD || (N0.getOpcode() == ISD::ANY_EXTEND && N0.getOperand(0).getOpcode() == ISD::LOAD))) { bool HasAnyExt = N0.getOpcode() == ISD::ANY_EXTEND; LoadSDNode *LN0 = HasAnyExt ? cast(N0.getOperand(0)) : cast(N0); if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed() && N0.hasOneUse() && SDValue(LN0, 0).hasOneUse()) { uint32_t ActiveBits = N1C->getAPIntValue().getActiveBits(); if (ActiveBits > 0 && APIntOps::isMask(ActiveBits, N1C->getAPIntValue())){ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); EVT LoadedVT = LN0->getMemoryVT(); EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT; if (ExtVT == LoadedVT && (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) { SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, LN0->getChain(), LN0->getBasePtr(), ExtVT, LN0->getMemOperand()); AddToWorklist(N); CombineTo(LN0, NewLoad, NewLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } // Do not change the width of a volatile load. // Do not generate loads of non-round integer types since these can // be expensive (and would be wrong if the type is not byte sized). if (!LN0->isVolatile() && LoadedVT.bitsGT(ExtVT) && ExtVT.isRound() && (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) { EVT PtrType = LN0->getOperand(1).getValueType(); unsigned Alignment = LN0->getAlignment(); SDValue NewPtr = LN0->getBasePtr(); // For big endian targets, we need to add an offset to the pointer // to load the correct bytes. For little endian systems, we merely // need to read fewer bytes from the same pointer. if (TLI.isBigEndian()) { unsigned LVTStoreBytes = LoadedVT.getStoreSize(); unsigned EVTStoreBytes = ExtVT.getStoreSize(); unsigned PtrOff = LVTStoreBytes - EVTStoreBytes; NewPtr = DAG.getNode(ISD::ADD, SDLoc(LN0), PtrType, NewPtr, DAG.getConstant(PtrOff, PtrType)); Alignment = MinAlign(Alignment, PtrOff); } AddToWorklist(NewPtr.getNode()); SDValue Load = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, LN0->getChain(), NewPtr, LN0->getPointerInfo(), ExtVT, LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(), Alignment, LN0->getAAInfo()); AddToWorklist(N); CombineTo(LN0, Load, Load.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } } } if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL && VT.getSizeInBits() <= 64) { if (ConstantSDNode *ADDI = dyn_cast(N0.getOperand(1))) { APInt ADDC = ADDI->getAPIntValue(); if (!TLI.isLegalAddImmediate(ADDC.getSExtValue())) { // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal // immediate for an add, but it is legal if its top c2 bits are set, // transform the ADD so the immediate doesn't need to be materialized // in a register. if (ConstantSDNode *SRLI = dyn_cast(N1.getOperand(1))) { APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), SRLI->getZExtValue()); if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) { ADDC |= Mask; if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) { SDValue NewAdd = DAG.getNode(ISD::ADD, SDLoc(N0), VT, N0.getOperand(0), DAG.getConstant(ADDC, VT)); CombineTo(N0.getNode(), NewAdd); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } } } } } // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const) if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) { SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), N0.getOperand(1), false); if (BSwap.getNode()) return BSwap; } return SDValue(); } /// Match (a >> 8) | (a << 8) as (bswap a) >> 16. SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits) { if (!LegalOperations) return SDValue(); EVT VT = N->getValueType(0); if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16) return SDValue(); if (!TLI.isOperationLegal(ISD::BSWAP, VT)) return SDValue(); // Recognize (and (shl a, 8), 0xff), (and (srl a, 8), 0xff00) bool LookPassAnd0 = false; bool LookPassAnd1 = false; if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL) std::swap(N0, N1); if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL) std::swap(N0, N1); if (N0.getOpcode() == ISD::AND) { if (!N0.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N01C = dyn_cast(N0.getOperand(1)); if (!N01C || N01C->getZExtValue() != 0xFF00) return SDValue(); N0 = N0.getOperand(0); LookPassAnd0 = true; } if (N1.getOpcode() == ISD::AND) { if (!N1.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N11C = dyn_cast(N1.getOperand(1)); if (!N11C || N11C->getZExtValue() != 0xFF) return SDValue(); N1 = N1.getOperand(0); LookPassAnd1 = true; } if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) std::swap(N0, N1); if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) return SDValue(); if (!N0.getNode()->hasOneUse() || !N1.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N01C = dyn_cast(N0.getOperand(1)); ConstantSDNode *N11C = dyn_cast(N1.getOperand(1)); if (!N01C || !N11C) return SDValue(); if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8) return SDValue(); // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8) SDValue N00 = N0->getOperand(0); if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) { if (!N00.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N001C = dyn_cast(N00.getOperand(1)); if (!N001C || N001C->getZExtValue() != 0xFF) return SDValue(); N00 = N00.getOperand(0); LookPassAnd0 = true; } SDValue N10 = N1->getOperand(0); if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) { if (!N10.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N101C = dyn_cast(N10.getOperand(1)); if (!N101C || N101C->getZExtValue() != 0xFF00) return SDValue(); N10 = N10.getOperand(0); LookPassAnd1 = true; } if (N00 != N10) return SDValue(); // Make sure everything beyond the low halfword gets set to zero since the SRL // 16 will clear the top bits. unsigned OpSizeInBits = VT.getSizeInBits(); if (DemandHighBits && OpSizeInBits > 16) { // If the left-shift isn't masked out then the only way this is a bswap is // if all bits beyond the low 8 are 0. In that case the entire pattern // reduces to a left shift anyway: leave it for other parts of the combiner. if (!LookPassAnd0) return SDValue(); // However, if the right shift isn't masked out then it might be because // it's not needed. See if we can spot that too. if (!LookPassAnd1 && !DAG.MaskedValueIsZero( N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16))) return SDValue(); } SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00); if (OpSizeInBits > 16) Res = DAG.getNode(ISD::SRL, SDLoc(N), VT, Res, DAG.getConstant(OpSizeInBits-16, getShiftAmountTy(VT))); return Res; } /// Return true if the specified node is an element that makes up a 32-bit /// packed halfword byteswap. /// ((x & 0x000000ff) << 8) | /// ((x & 0x0000ff00) >> 8) | /// ((x & 0x00ff0000) << 8) | /// ((x & 0xff000000) >> 8) static bool isBSwapHWordElement(SDValue N, MutableArrayRef Parts) { if (!N.getNode()->hasOneUse()) return false; unsigned Opc = N.getOpcode(); if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL) return false; ConstantSDNode *N1C = dyn_cast(N.getOperand(1)); if (!N1C) return false; unsigned Num; switch (N1C->getZExtValue()) { default: return false; case 0xFF: Num = 0; break; case 0xFF00: Num = 1; break; case 0xFF0000: Num = 2; break; case 0xFF000000: Num = 3; break; } // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00). SDValue N0 = N.getOperand(0); if (Opc == ISD::AND) { if (Num == 0 || Num == 2) { // (x >> 8) & 0xff // (x >> 8) & 0xff0000 if (N0.getOpcode() != ISD::SRL) return false; ConstantSDNode *C = dyn_cast(N0.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } else { // (x << 8) & 0xff00 // (x << 8) & 0xff000000 if (N0.getOpcode() != ISD::SHL) return false; ConstantSDNode *C = dyn_cast(N0.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } } else if (Opc == ISD::SHL) { // (x & 0xff) << 8 // (x & 0xff0000) << 8 if (Num != 0 && Num != 2) return false; ConstantSDNode *C = dyn_cast(N.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } else { // Opc == ISD::SRL // (x & 0xff00) >> 8 // (x & 0xff000000) >> 8 if (Num != 1 && Num != 3) return false; ConstantSDNode *C = dyn_cast(N.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } if (Parts[Num]) return false; Parts[Num] = N0.getOperand(0).getNode(); return true; } /// Match a 32-bit packed halfword bswap. That is /// ((x & 0x000000ff) << 8) | /// ((x & 0x0000ff00) >> 8) | /// ((x & 0x00ff0000) << 8) | /// ((x & 0xff000000) >> 8) /// => (rotl (bswap x), 16) SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { if (!LegalOperations) return SDValue(); EVT VT = N->getValueType(0); if (VT != MVT::i32) return SDValue(); if (!TLI.isOperationLegal(ISD::BSWAP, VT)) return SDValue(); // Look for either // (or (or (and), (and)), (or (and), (and))) // (or (or (or (and), (and)), (and)), (and)) if (N0.getOpcode() != ISD::OR) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); SDNode *Parts[4] = {}; if (N1.getOpcode() == ISD::OR && N00.getNumOperands() == 2 && N01.getNumOperands() == 2) { // (or (or (and), (and)), (or (and), (and))) SDValue N000 = N00.getOperand(0); if (!isBSwapHWordElement(N000, Parts)) return SDValue(); SDValue N001 = N00.getOperand(1); if (!isBSwapHWordElement(N001, Parts)) return SDValue(); SDValue N010 = N01.getOperand(0); if (!isBSwapHWordElement(N010, Parts)) return SDValue(); SDValue N011 = N01.getOperand(1); if (!isBSwapHWordElement(N011, Parts)) return SDValue(); } else { // (or (or (or (and), (and)), (and)), (and)) if (!isBSwapHWordElement(N1, Parts)) return SDValue(); if (!isBSwapHWordElement(N01, Parts)) return SDValue(); if (N00.getOpcode() != ISD::OR) return SDValue(); SDValue N000 = N00.getOperand(0); if (!isBSwapHWordElement(N000, Parts)) return SDValue(); SDValue N001 = N00.getOperand(1); if (!isBSwapHWordElement(N001, Parts)) return SDValue(); } // Make sure the parts are all coming from the same node. if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3]) return SDValue(); SDValue BSwap = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, SDValue(Parts[0],0)); // Result of the bswap should be rotated by 16. If it's not legal, then // do (x << 16) | (x >> 16). SDValue ShAmt = DAG.getConstant(16, getShiftAmountTy(VT)); if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT)) return DAG.getNode(ISD::ROTL, SDLoc(N), VT, BSwap, ShAmt); if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT)) return DAG.getNode(ISD::ROTR, SDLoc(N), VT, BSwap, ShAmt); return DAG.getNode(ISD::OR, SDLoc(N), VT, DAG.getNode(ISD::SHL, SDLoc(N), VT, BSwap, ShAmt), DAG.getNode(ISD::SRL, SDLoc(N), VT, BSwap, ShAmt)); } SDValue DAGCombiner::visitOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue LL, LR, RL, RR, CC0, CC1; ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N1.getValueType(); // fold vector ops if (VT.isVector()) { SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; // fold (or x, 0) -> x, vector edition if (ISD::isBuildVectorAllZeros(N0.getNode())) return N1; if (ISD::isBuildVectorAllZeros(N1.getNode())) return N0; // fold (or x, -1) -> -1, vector edition if (ISD::isBuildVectorAllOnes(N0.getNode())) // do not return N0, because undef node may exist in N0 return DAG.getConstant( APInt::getAllOnesValue( N0.getValueType().getScalarType().getSizeInBits()), N0.getValueType()); if (ISD::isBuildVectorAllOnes(N1.getNode())) // do not return N1, because undef node may exist in N1 return DAG.getConstant( APInt::getAllOnesValue( N1.getValueType().getScalarType().getSizeInBits()), N1.getValueType()); // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask1) // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf B, A, Mask2) // Do this only if the resulting shuffle is legal. if (isa(N0) && isa(N1) && // Avoid folding a node with illegal type. TLI.isTypeLegal(VT) && N0->getOperand(1) == N1->getOperand(1) && ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode())) { bool CanFold = true; unsigned NumElts = VT.getVectorNumElements(); const ShuffleVectorSDNode *SV0 = cast(N0); const ShuffleVectorSDNode *SV1 = cast(N1); // We construct two shuffle masks: // - Mask1 is a shuffle mask for a shuffle with N0 as the first operand // and N1 as the second operand. // - Mask2 is a shuffle mask for a shuffle with N1 as the first operand // and N0 as the second operand. // We do this because OR is commutable and therefore there might be // two ways to fold this node into a shuffle. SmallVector Mask1; SmallVector Mask2; for (unsigned i = 0; i != NumElts && CanFold; ++i) { int M0 = SV0->getMaskElt(i); int M1 = SV1->getMaskElt(i); // Both shuffle indexes are undef. Propagate Undef. if (M0 < 0 && M1 < 0) { Mask1.push_back(M0); Mask2.push_back(M0); continue; } if (M0 < 0 || M1 < 0 || (M0 < (int)NumElts && M1 < (int)NumElts) || (M0 >= (int)NumElts && M1 >= (int)NumElts)) { CanFold = false; break; } Mask1.push_back(M0 < (int)NumElts ? M0 : M1 + NumElts); Mask2.push_back(M1 < (int)NumElts ? M1 : M0 + NumElts); } if (CanFold) { // Fold this sequence only if the resulting shuffle is 'legal'. if (TLI.isShuffleMaskLegal(Mask1, VT)) return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1->getOperand(0), &Mask1[0]); if (TLI.isShuffleMaskLegal(Mask2, VT)) return DAG.getVectorShuffle(VT, SDLoc(N), N1->getOperand(0), N0->getOperand(0), &Mask2[0]); } } } // fold (or x, undef) -> -1 if (!LegalOperations && (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)) { EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT; return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT); } // fold (or c1, c2) -> c1|c2 if (N0C && N1C) return DAG.FoldConstantArithmetic(ISD::OR, VT, N0C, N1C); // canonicalize constant to RHS if (N0C && !N1C) return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0); // fold (or x, 0) -> x if (N1C && N1C->isNullValue()) return N0; // fold (or x, -1) -> -1 if (N1C && N1C->isAllOnesValue()) return N1; // fold (or x, c) -> c iff (x & ~c) == 0 if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue())) return N1; // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16) SDValue BSwap = MatchBSwapHWord(N, N0, N1); if (BSwap.getNode()) return BSwap; BSwap = MatchBSwapHWordLow(N, N0, N1); if (BSwap.getNode()) return BSwap; // reassociate or SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1); if (ROR.getNode()) return ROR; // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2) // iff (c1 & c2) == 0. if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && isa(N0.getOperand(1))) { ConstantSDNode *C1 = cast(N0.getOperand(1)); if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0) { if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1)) return DAG.getNode( ISD::AND, SDLoc(N), VT, DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR); return SDValue(); } } // fold (or (setcc x), (setcc y)) -> (setcc (or x, y)) if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){ ISD::CondCode Op0 = cast(CC0)->get(); ISD::CondCode Op1 = cast(CC1)->get(); if (LR == RR && isa(LR) && Op0 == Op1 && LL.getValueType().isInteger()) { // fold (or (setne X, 0), (setne Y, 0)) -> (setne (or X, Y), 0) // fold (or (setlt X, 0), (setlt Y, 0)) -> (setne (or X, Y), 0) if (cast(LR)->isNullValue() && (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) { SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(LR), LR.getValueType(), LL, RL); AddToWorklist(ORNode.getNode()); return DAG.getSetCC(SDLoc(N), VT, ORNode, LR, Op1); } // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1) // fold (or (setgt X, -1), (setgt Y -1)) -> (setgt (and X, Y), -1) if (cast(LR)->isAllOnesValue() && (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) { SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(LR), LR.getValueType(), LL, RL); AddToWorklist(ANDNode.getNode()); return DAG.getSetCC(SDLoc(N), VT, ANDNode, LR, Op1); } } // canonicalize equivalent to ll == rl if (LL == RR && LR == RL) { Op1 = ISD::getSetCCSwappedOperands(Op1); std::swap(RL, RR); } if (LL == RL && LR == RR) { bool isInteger = LL.getValueType().isInteger(); ISD::CondCode Result = ISD::getSetCCOrOperation(Op0, Op1, isInteger); if (Result != ISD::SETCC_INVALID && (!LegalOperations || (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && TLI.isOperationLegal(ISD::SETCC, getSetCCResultType(N0.getValueType()))))) return DAG.getSetCC(SDLoc(N), N0.getValueType(), LL, LR, Result); } } // Simplify: (or (op x...), (op y...)) -> (op (or x, y)) if (N0.getOpcode() == N1.getOpcode()) { SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N); if (Tmp.getNode()) return Tmp; } // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible. if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant && N1.getOperand(1).getOpcode() == ISD::Constant && // Don't increase # computations. (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { // We can only do this xform if we know that bits from X that are set in C2 // but not in C1 are already zero. Likewise for Y. const APInt &LHSMask = cast(N0.getOperand(1))->getAPIntValue(); const APInt &RHSMask = cast(N1.getOperand(1))->getAPIntValue(); if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) && DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) { SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1.getOperand(0)); return DAG.getNode(ISD::AND, SDLoc(N), VT, X, DAG.getConstant(LHSMask | RHSMask, VT)); } } // See if this is some rotate idiom. if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N))) return SDValue(Rot, 0); // Simplify the operands using demanded-bits information. if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); return SDValue(); } /// Match "(X shl/srl V1) & V2" where V2 may not be present. static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) { if (Op.getOpcode() == ISD::AND) { if (isa(Op.getOperand(1))) { Mask = Op.getOperand(1); Op = Op.getOperand(0); } else { return false; } } if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) { Shift = Op; return true; } return false; } // Return true if we can prove that, whenever Neg and Pos are both in the // range [0, OpSize), Neg == (Pos == 0 ? 0 : OpSize - Pos). This means that // for two opposing shifts shift1 and shift2 and a value X with OpBits bits: // // (or (shift1 X, Neg), (shift2 X, Pos)) // // reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate // in direction shift1 by Neg. The range [0, OpSize) means that we only need // to consider shift amounts with defined behavior. static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned OpSize) { // If OpSize is a power of 2 then: // // (a) (Pos == 0 ? 0 : OpSize - Pos) == (OpSize - Pos) & (OpSize - 1) // (b) Neg == Neg & (OpSize - 1) whenever Neg is in [0, OpSize). // // So if OpSize is a power of 2 and Neg is (and Neg', OpSize-1), we check // for the stronger condition: // // Neg & (OpSize - 1) == (OpSize - Pos) & (OpSize - 1) [A] // // for all Neg and Pos. Since Neg & (OpSize - 1) == Neg' & (OpSize - 1) // we can just replace Neg with Neg' for the rest of the function. // // In other cases we check for the even stronger condition: // // Neg == OpSize - Pos [B] // // for all Neg and Pos. Note that the (or ...) then invokes undefined // behavior if Pos == 0 (and consequently Neg == OpSize). // // We could actually use [A] whenever OpSize is a power of 2, but the // only extra cases that it would match are those uninteresting ones // where Neg and Pos are never in range at the same time. E.g. for // OpSize == 32, using [A] would allow a Neg of the form (sub 64, Pos) // as well as (sub 32, Pos), but: // // (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos)) // // always invokes undefined behavior for 32-bit X. // // Below, Mask == OpSize - 1 when using [A] and is all-ones otherwise. unsigned MaskLoBits = 0; if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(OpSize) && Neg.getOperand(1).getOpcode() == ISD::Constant && cast(Neg.getOperand(1))->getAPIntValue() == OpSize - 1) { Neg = Neg.getOperand(0); MaskLoBits = Log2_64(OpSize); } // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1. if (Neg.getOpcode() != ISD::SUB) return 0; ConstantSDNode *NegC = dyn_cast(Neg.getOperand(0)); if (!NegC) return 0; SDValue NegOp1 = Neg.getOperand(1); // On the RHS of [A], if Pos is Pos' & (OpSize - 1), just replace Pos with // Pos'. The truncation is redundant for the purpose of the equality. if (MaskLoBits && Pos.getOpcode() == ISD::AND && Pos.getOperand(1).getOpcode() == ISD::Constant && cast(Pos.getOperand(1))->getAPIntValue() == OpSize - 1) Pos = Pos.getOperand(0); // The condition we need is now: // // (NegC - NegOp1) & Mask == (OpSize - Pos) & Mask // // If NegOp1 == Pos then we need: // // OpSize & Mask == NegC & Mask // // (because "x & Mask" is a truncation and distributes through subtraction). APInt Width; if (Pos == NegOp1) Width = NegC->getAPIntValue(); // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC. // Then the condition we want to prove becomes: // // (NegC - NegOp1) & Mask == (OpSize - (NegOp1 + PosC)) & Mask // // which, again because "x & Mask" is a truncation, becomes: // // NegC & Mask == (OpSize - PosC) & Mask // OpSize & Mask == (NegC + PosC) & Mask else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1 && Pos.getOperand(1).getOpcode() == ISD::Constant) Width = (cast(Pos.getOperand(1))->getAPIntValue() + NegC->getAPIntValue()); else return false; // Now we just need to check that OpSize & Mask == Width & Mask. if (MaskLoBits) // Opsize & Mask is 0 since Mask is Opsize - 1. return Width.getLoBits(MaskLoBits) == 0; return Width == OpSize; } // A subroutine of MatchRotate used once we have found an OR of two opposite // shifts of Shifted. If Neg == - Pos then the OR reduces // to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the // former being preferred if supported. InnerPos and InnerNeg are Pos and // Neg with outer conversions stripped away. SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, SDValue InnerPos, SDValue InnerNeg, unsigned PosOpcode, unsigned NegOpcode, SDLoc DL) { // fold (or (shl x, (*ext y)), // (srl x, (*ext (sub 32, y)))) -> // (rotl x, y) or (rotr x, (sub 32, y)) // // fold (or (shl x, (*ext (sub 32, y))), // (srl x, (*ext y))) -> // (rotr x, y) or (rotl x, (sub 32, y)) EVT VT = Shifted.getValueType(); if (matchRotateSub(InnerPos, InnerNeg, VT.getSizeInBits())) { bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT); return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted, HasPos ? Pos : Neg).getNode(); } return nullptr; } // MatchRotate - Handle an 'or' of two operands. If this is one of the many // idioms for rotate, and if the target supports rotation instructions, generate // a rot[lr]. SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) { // Must be a legal type. Expanded 'n promoted things won't work with rotates. EVT VT = LHS.getValueType(); if (!TLI.isTypeLegal(VT)) return nullptr; // The target must have at least one rotate flavor. bool HasROTL = TLI.isOperationLegalOrCustom(ISD::ROTL, VT); bool HasROTR = TLI.isOperationLegalOrCustom(ISD::ROTR, VT); if (!HasROTL && !HasROTR) return nullptr; // Match "(X shl/srl V1) & V2" where V2 may not be present. SDValue LHSShift; // The shift. SDValue LHSMask; // AND value if any. if (!MatchRotateHalf(LHS, LHSShift, LHSMask)) return nullptr; // Not part of a rotate. SDValue RHSShift; // The shift. SDValue RHSMask; // AND value if any. if (!MatchRotateHalf(RHS, RHSShift, RHSMask)) return nullptr; // Not part of a rotate. if (LHSShift.getOperand(0) != RHSShift.getOperand(0)) return nullptr; // Not shifting the same value. if (LHSShift.getOpcode() == RHSShift.getOpcode()) return nullptr; // Shifts must disagree. // Canonicalize shl to left side in a shl/srl pair. if (RHSShift.getOpcode() == ISD::SHL) { std::swap(LHS, RHS); std::swap(LHSShift, RHSShift); std::swap(LHSMask , RHSMask ); } unsigned OpSizeInBits = VT.getSizeInBits(); SDValue LHSShiftArg = LHSShift.getOperand(0); SDValue LHSShiftAmt = LHSShift.getOperand(1); SDValue RHSShiftArg = RHSShift.getOperand(0); SDValue RHSShiftAmt = RHSShift.getOperand(1); // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) if (LHSShiftAmt.getOpcode() == ISD::Constant && RHSShiftAmt.getOpcode() == ISD::Constant) { uint64_t LShVal = cast(LHSShiftAmt)->getZExtValue(); uint64_t RShVal = cast(RHSShiftAmt)->getZExtValue(); if ((LShVal + RShVal) != OpSizeInBits) return nullptr; SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt); // If there is an AND of either shifted operand, apply it to the result. if (LHSMask.getNode() || RHSMask.getNode()) { APInt Mask = APInt::getAllOnesValue(OpSizeInBits); if (LHSMask.getNode()) { APInt RHSBits = APInt::getLowBitsSet(OpSizeInBits, LShVal); Mask &= cast(LHSMask)->getAPIntValue() | RHSBits; } if (RHSMask.getNode()) { APInt LHSBits = APInt::getHighBitsSet(OpSizeInBits, RShVal); Mask &= cast(RHSMask)->getAPIntValue() | LHSBits; } Rot = DAG.getNode(ISD::AND, DL, VT, Rot, DAG.getConstant(Mask, VT)); } return Rot.getNode(); } // If there is a mask here, and we have a variable shift, we can't be sure // that we're masking out the right stuff. if (LHSMask.getNode() || RHSMask.getNode()) return nullptr; // If the shift amount is sign/zext/any-extended just peel it off. SDValue LExtOp0 = LHSShiftAmt; SDValue RExtOp0 = RHSShiftAmt; if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND || LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND || LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND || LHSShiftAmt.getOpcode() == ISD::TRUNCATE) && (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND || RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND || RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND || RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) { LExtOp0 = LHSShiftAmt.getOperand(0); RExtOp0 = RHSShiftAmt.getOperand(0); } SDNode *TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR, DL); if (TryL) return TryL; SDNode *TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL); if (TryR) return TryR; return nullptr; } SDValue DAGCombiner::visitXOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue LHS, RHS, CC; ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N0.getValueType(); // fold vector ops if (VT.isVector()) { SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; // fold (xor x, 0) -> x, vector edition if (ISD::isBuildVectorAllZeros(N0.getNode())) return N1; if (ISD::isBuildVectorAllZeros(N1.getNode())) return N0; } // fold (xor undef, undef) -> 0. This is a common idiom (misuse). if (N0.getOpcode() == ISD::UNDEF && N1.getOpcode() == ISD::UNDEF) return DAG.getConstant(0, VT); // fold (xor x, undef) -> undef if (N0.getOpcode() == ISD::UNDEF) return N0; if (N1.getOpcode() == ISD::UNDEF) return N1; // fold (xor c1, c2) -> c1^c2 if (N0C && N1C) return DAG.FoldConstantArithmetic(ISD::XOR, VT, N0C, N1C); // canonicalize constant to RHS if (N0C && !N1C) return DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0); // fold (xor x, 0) -> x if (N1C && N1C->isNullValue()) return N0; // reassociate xor SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1); if (RXOR.getNode()) return RXOR; // fold !(x cc y) -> (x !cc y) if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) { bool isInt = LHS.getValueType().isInteger(); ISD::CondCode NotCC = ISD::getSetCCInverse(cast(CC)->get(), isInt); if (!LegalOperations || TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) { switch (N0.getOpcode()) { default: llvm_unreachable("Unhandled SetCC Equivalent!"); case ISD::SETCC: return DAG.getSetCC(SDLoc(N), VT, LHS, RHS, NotCC); case ISD::SELECT_CC: return DAG.getSelectCC(SDLoc(N), LHS, RHS, N0.getOperand(2), N0.getOperand(3), NotCC); } } } // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y))) if (N1C && N1C->getAPIntValue() == 1 && N0.getOpcode() == ISD::ZERO_EXTEND && N0.getNode()->hasOneUse() && isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){ SDValue V = N0.getOperand(0); V = DAG.getNode(ISD::XOR, SDLoc(N0), V.getValueType(), V, DAG.getConstant(1, V.getValueType())); AddToWorklist(V.getNode()); return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, V); } // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc if (N1C && N1C->getAPIntValue() == 1 && VT == MVT::i1 && (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) { SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1); if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) { unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND; LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode()); return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); } } // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants if (N1C && N1C->isAllOnesValue() && (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) { SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1); if (isa(RHS) || isa(LHS)) { unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND; LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode()); return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); } } // fold (xor (and x, y), y) -> (and (not x), y) if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && N0->getOperand(1) == N1) { SDValue X = N0->getOperand(0); SDValue NotX = DAG.getNOT(SDLoc(X), X, VT); AddToWorklist(NotX.getNode()); return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1); } // fold (xor (xor x, c1), c2) -> (xor x, (xor c1, c2)) if (N1C && N0.getOpcode() == ISD::XOR) { ConstantSDNode *N00C = dyn_cast(N0.getOperand(0)); ConstantSDNode *N01C = dyn_cast(N0.getOperand(1)); if (N00C) return DAG.getNode(ISD::XOR, SDLoc(N), VT, N0.getOperand(1), DAG.getConstant(N1C->getAPIntValue() ^ N00C->getAPIntValue(), VT)); if (N01C) return DAG.getNode(ISD::XOR, SDLoc(N), VT, N0.getOperand(0), DAG.getConstant(N1C->getAPIntValue() ^ N01C->getAPIntValue(), VT)); } // fold (xor x, x) -> 0 if (N0 == N1) return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes); // Simplify: xor (op x...), (op y...) -> (op (xor x, y)) if (N0.getOpcode() == N1.getOpcode()) { SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N); if (Tmp.getNode()) return Tmp; } // Simplify the expression using non-local knowledge. if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); return SDValue(); } /// Handle transforms common to the three shifts, when the shift amount is a /// constant. SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) { // We can't and shouldn't fold opaque constants. if (Amt->isOpaque()) return SDValue(); SDNode *LHS = N->getOperand(0).getNode(); if (!LHS->hasOneUse()) return SDValue(); // We want to pull some binops through shifts, so that we have (and (shift)) // instead of (shift (and)), likewise for add, or, xor, etc. This sort of // thing happens with address calculations, so it's important to canonicalize // it. bool HighBitSet = false; // Can we transform this if the high bit is set? switch (LHS->getOpcode()) { default: return SDValue(); case ISD::OR: case ISD::XOR: HighBitSet = false; // We can only transform sra if the high bit is clear. break; case ISD::AND: HighBitSet = true; // We can only transform sra if the high bit is set. break; case ISD::ADD: if (N->getOpcode() != ISD::SHL) return SDValue(); // only shl(add) not sr[al](add). HighBitSet = false; // We can only transform sra if the high bit is clear. break; } // We require the RHS of the binop to be a constant and not opaque as well. ConstantSDNode *BinOpCst = dyn_cast(LHS->getOperand(1)); if (!BinOpCst || BinOpCst->isOpaque()) return SDValue(); // FIXME: disable this unless the input to the binop is a shift by a constant. // If it is not a shift, it pessimizes some common cases like: // // void foo(int *X, int i) { X[i & 1235] = 1; } // int bar(int *X, int i) { return X[i & 255]; } SDNode *BinOpLHSVal = LHS->getOperand(0).getNode(); if ((BinOpLHSVal->getOpcode() != ISD::SHL && BinOpLHSVal->getOpcode() != ISD::SRA && BinOpLHSVal->getOpcode() != ISD::SRL) || !isa(BinOpLHSVal->getOperand(1))) return SDValue(); EVT VT = N->getValueType(0); // If this is a signed shift right, and the high bit is modified by the // logical operation, do not perform the transformation. The highBitSet // boolean indicates the value of the high bit of the constant which would // cause it to be modified for this operation. if (N->getOpcode() == ISD::SRA) { bool BinOpRHSSignSet = BinOpCst->getAPIntValue().isNegative(); if (BinOpRHSSignSet != HighBitSet) return SDValue(); } if (!TLI.isDesirableToCommuteWithShift(LHS)) return SDValue(); // Fold the constants, shifting the binop RHS by the shift amount. SDValue NewRHS = DAG.getNode(N->getOpcode(), SDLoc(LHS->getOperand(1)), N->getValueType(0), LHS->getOperand(1), N->getOperand(1)); assert(isa(NewRHS) && "Folding was not successful!"); // Create the new shift. SDValue NewShift = DAG.getNode(N->getOpcode(), SDLoc(LHS->getOperand(0)), VT, LHS->getOperand(0), N->getOperand(1)); // Create the new binop. return DAG.getNode(LHS->getOpcode(), SDLoc(N), VT, NewShift, NewRHS); } SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) { assert(N->getOpcode() == ISD::TRUNCATE); assert(N->getOperand(0).getOpcode() == ISD::AND); // (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC) if (N->hasOneUse() && N->getOperand(0).hasOneUse()) { SDValue N01 = N->getOperand(0).getOperand(1); if (ConstantSDNode *N01C = isConstOrConstSplat(N01)) { EVT TruncVT = N->getValueType(0); SDValue N00 = N->getOperand(0).getOperand(0); APInt TruncC = N01C->getAPIntValue(); TruncC = TruncC.trunc(TruncVT.getScalarSizeInBits()); return DAG.getNode(ISD::AND, SDLoc(N), TruncVT, DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N00), DAG.getConstant(TruncC, TruncVT)); } } return SDValue(); } SDValue DAGCombiner::visitRotate(SDNode *N) { // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))). if (N->getOperand(1).getOpcode() == ISD::TRUNCATE && N->getOperand(1).getOperand(0).getOpcode() == ISD::AND) { SDValue NewOp1 = distributeTruncateThroughAnd(N->getOperand(1).getNode()); if (NewOp1.getNode()) return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), N->getOperand(0), NewOp1); } return SDValue(); } SDValue DAGCombiner::visitSHL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); // fold vector ops if (VT.isVector()) { SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; BuildVectorSDNode *N1CV = dyn_cast(N1); // If setcc produces all-one true value then: // (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<isConstant()) { if (N0.getOpcode() == ISD::AND) { SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); BuildVectorSDNode *N01CV = dyn_cast(N01); if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC && TLI.getBooleanContents(N00.getOperand(0).getValueType()) == TargetLowering::ZeroOrNegativeOneBooleanContent) { if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, VT, N01CV, N1CV)) return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C); } } else { N1C = isConstOrConstSplat(N1); } } } // fold (shl c1, c2) -> c1< 0 if (N0C && N0C->isNullValue()) return N0; // fold (shl x, c >= size(x)) -> undef if (N1C && N1C->getZExtValue() >= OpSizeInBits) return DAG.getUNDEF(VT); // fold (shl x, 0) -> x if (N1C && N1C->isNullValue()) return N0; // fold (shl undef, x) -> 0 if (N0.getOpcode() == ISD::UNDEF) return DAG.getConstant(0, VT); // if (shl x, c) is known to be zero, return 0 if (DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) return DAG.getConstant(0, VT); // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()); if (NewOp1.getNode()) return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1); } if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2)) if (N1C && N0.getOpcode() == ISD::SHL) { if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { uint64_t c1 = N0C1->getZExtValue(); uint64_t c2 = N1C->getZExtValue(); if (c1 + c2 >= OpSizeInBits) return DAG.getConstant(0, VT); return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0.getOperand(0), DAG.getConstant(c1 + c2, N1.getValueType())); } } // fold (shl (ext (shl x, c1)), c2) -> (ext (shl x, (add c1, c2))) // For this to be valid, the second form must not preserve any of the bits // that are shifted out by the inner shift in the first form. This means // the outer shift size must be >= the number of bits added by the ext. // As a corollary, we don't care what kind of ext it is. if (N1C && (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND) && N0.getOperand(0).getOpcode() == ISD::SHL) { SDValue N0Op0 = N0.getOperand(0); if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) { uint64_t c1 = N0Op0C1->getZExtValue(); uint64_t c2 = N1C->getZExtValue(); EVT InnerShiftVT = N0Op0.getValueType(); uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits(); if (c2 >= OpSizeInBits - InnerShiftSize) { if (c1 + c2 >= OpSizeInBits) return DAG.getConstant(0, VT); return DAG.getNode(ISD::SHL, SDLoc(N0), VT, DAG.getNode(N0.getOpcode(), SDLoc(N0), VT, N0Op0->getOperand(0)), DAG.getConstant(c1 + c2, N1.getValueType())); } } } // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C)) // Only fold this if the inner zext has no other uses to avoid increasing // the total number of instructions. if (N1C && N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::SRL) { SDValue N0Op0 = N0.getOperand(0); if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) { uint64_t c1 = N0Op0C1->getZExtValue(); if (c1 < VT.getScalarSizeInBits()) { uint64_t c2 = N1C->getZExtValue(); if (c1 == c2) { SDValue NewOp0 = N0.getOperand(0); EVT CountVT = NewOp0.getOperand(1).getValueType(); SDValue NewSHL = DAG.getNode(ISD::SHL, SDLoc(N), NewOp0.getValueType(), NewOp0, DAG.getConstant(c2, CountVT)); AddToWorklist(NewSHL.getNode()); return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL); } } } } // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or // (and (srl x, (sub c1, c2), MASK) // Only fold this if the inner shift has no other uses -- if it does, folding // this will increase the total number of instructions. if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse()) { if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { uint64_t c1 = N0C1->getZExtValue(); if (c1 < OpSizeInBits) { uint64_t c2 = N1C->getZExtValue(); APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1); SDValue Shift; if (c2 > c1) { Mask = Mask.shl(c2 - c1); Shift = DAG.getNode(ISD::SHL, SDLoc(N), VT, N0.getOperand(0), DAG.getConstant(c2 - c1, N1.getValueType())); } else { Mask = Mask.lshr(c1 - c2); Shift = DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), DAG.getConstant(c1 - c2, N1.getValueType())); } return DAG.getNode(ISD::AND, SDLoc(N0), VT, Shift, DAG.getConstant(Mask, VT)); } } } // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1)) if (N1C && N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1)) { unsigned BitSize = VT.getScalarSizeInBits(); SDValue HiBitsMask = DAG.getConstant(APInt::getHighBitsSet(BitSize, BitSize - N1C->getZExtValue()), VT); return DAG.getNode(ISD::AND, SDLoc(N), VT, N0.getOperand(0), HiBitsMask); } // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) // Variant of version done on multiply, except mul by a power of 2 is turned // into a shift. APInt Val; if (N1C && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() && (isa(N0.getOperand(1)) || isConstantSplatVector(N0.getOperand(1).getNode(), Val))) { SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1); SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1); return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1); } if (N1C) { SDValue NewSHL = visitShiftByConstant(N, N1C); if (NewSHL.getNode()) return NewSHL; } return SDValue(); } SDValue DAGCombiner::visitSRA(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarType().getSizeInBits(); // fold vector ops if (VT.isVector()) { SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; N1C = isConstOrConstSplat(N1); } // fold (sra c1, c2) -> (sra c1, c2) if (N0C && N1C) return DAG.FoldConstantArithmetic(ISD::SRA, VT, N0C, N1C); // fold (sra 0, x) -> 0 if (N0C && N0C->isNullValue()) return N0; // fold (sra -1, x) -> -1 if (N0C && N0C->isAllOnesValue()) return N0; // fold (sra x, (setge c, size(x))) -> undef if (N1C && N1C->getZExtValue() >= OpSizeInBits) return DAG.getUNDEF(VT); // fold (sra x, 0) -> x if (N1C && N1C->isNullValue()) return N0; // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports // sext_inreg. if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) { unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue(); EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits); if (VT.isVector()) ExtVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, VT.getVectorNumElements()); if ((!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, ExtVT))) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0), DAG.getValueType(ExtVT)); } // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2)) if (N1C && N0.getOpcode() == ISD::SRA) { if (ConstantSDNode *C1 = isConstOrConstSplat(N0.getOperand(1))) { unsigned Sum = N1C->getZExtValue() + C1->getZExtValue(); if (Sum >= OpSizeInBits) Sum = OpSizeInBits - 1; return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0), DAG.getConstant(Sum, N1.getValueType())); } } // fold (sra (shl X, m), (sub result_size, n)) // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for // result_size - n != m. // If truncate is free for the target sext(shl) is likely to result in better // code. if (N0.getOpcode() == ISD::SHL && N1C) { // Get the two constanst of the shifts, CN0 = m, CN = n. const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1)); if (N01C) { LLVMContext &Ctx = *DAG.getContext(); // Determine what the truncate's result bitsize and type would be. EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue()); if (VT.isVector()) TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements()); // Determine the residual right-shift amount. signed ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue(); // If the shift is not a no-op (in which case this should be just a sign // extend already), the truncated to type is legal, sign_extend is legal // on that type, and the truncate to that type is both legal and free, // perform the transform. if ((ShiftAmt > 0) && TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) && TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) && TLI.isTruncateFree(VT, TruncVT)) { SDValue Amt = DAG.getConstant(ShiftAmt, getShiftAmountTy(N0.getOperand(0).getValueType())); SDValue Shift = DAG.getNode(ISD::SRL, SDLoc(N0), VT, N0.getOperand(0), Amt); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), TruncVT, Shift); return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Trunc); } } } // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()); if (NewOp1.getNode()) return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1); } // fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2)) // if c1 is equal to the number of bits the trunc removes if (N0.getOpcode() == ISD::TRUNCATE && (N0.getOperand(0).getOpcode() == ISD::SRL || N0.getOperand(0).getOpcode() == ISD::SRA) && N0.getOperand(0).hasOneUse() && N0.getOperand(0).getOperand(1).hasOneUse() && N1C) { SDValue N0Op0 = N0.getOperand(0); if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) { unsigned LargeShiftVal = LargeShift->getZExtValue(); EVT LargeVT = N0Op0.getValueType(); if (LargeVT.getScalarSizeInBits() - OpSizeInBits == LargeShiftVal) { SDValue Amt = DAG.getConstant(LargeShiftVal + N1C->getZExtValue(), getShiftAmountTy(N0Op0.getOperand(0).getValueType())); SDValue SRA = DAG.getNode(ISD::SRA, SDLoc(N), LargeVT, N0Op0.getOperand(0), Amt); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, SRA); } } } // Simplify, based on bits shifted out of the LHS. if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // If the sign bit is known to be zero, switch this to a SRL. if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1); if (N1C) { SDValue NewSRA = visitShiftByConstant(N, N1C); if (NewSRA.getNode()) return NewSRA; } return SDValue(); } SDValue DAGCombiner::visitSRL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarType().getSizeInBits(); // fold vector ops if (VT.isVector()) { SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; N1C = isConstOrConstSplat(N1); } // fold (srl c1, c2) -> c1 >>u c2 if (N0C && N1C) return DAG.FoldConstantArithmetic(ISD::SRL, VT, N0C, N1C); // fold (srl 0, x) -> 0 if (N0C && N0C->isNullValue()) return N0; // fold (srl x, c >= size(x)) -> undef if (N1C && N1C->getZExtValue() >= OpSizeInBits) return DAG.getUNDEF(VT); // fold (srl x, 0) -> x if (N1C && N1C->isNullValue()) return N0; // if (srl x, c) is known to be zero, return 0 if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) return DAG.getConstant(0, VT); // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2)) if (N1C && N0.getOpcode() == ISD::SRL) { if (ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1))) { uint64_t c1 = N01C->getZExtValue(); uint64_t c2 = N1C->getZExtValue(); if (c1 + c2 >= OpSizeInBits) return DAG.getConstant(0, VT); return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), DAG.getConstant(c1 + c2, N1.getValueType())); } } // fold (srl (trunc (srl x, c1)), c2) -> 0 or (trunc (srl x, (add c1, c2))) if (N1C && N0.getOpcode() == ISD::TRUNCATE && N0.getOperand(0).getOpcode() == ISD::SRL && isa(N0.getOperand(0)->getOperand(1))) { uint64_t c1 = cast(N0.getOperand(0)->getOperand(1))->getZExtValue(); uint64_t c2 = N1C->getZExtValue(); EVT InnerShiftVT = N0.getOperand(0).getValueType(); EVT ShiftCountVT = N0.getOperand(0)->getOperand(1).getValueType(); uint64_t InnerShiftSize = InnerShiftVT.getScalarType().getSizeInBits(); // This is only valid if the OpSizeInBits + c1 = size of inner shift. if (c1 + OpSizeInBits == InnerShiftSize) { if (c1 + c2 >= InnerShiftSize) return DAG.getConstant(0, VT); return DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, DAG.getNode(ISD::SRL, SDLoc(N0), InnerShiftVT, N0.getOperand(0)->getOperand(0), DAG.getConstant(c1 + c2, ShiftCountVT))); } } // fold (srl (shl x, c), c) -> (and x, cst2) if (N1C && N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1) { unsigned BitSize = N0.getScalarValueSizeInBits(); if (BitSize <= 64) { uint64_t ShAmt = N1C->getZExtValue() + 64 - BitSize; return DAG.getNode(ISD::AND, SDLoc(N), VT, N0.getOperand(0), DAG.getConstant(~0ULL >> ShAmt, VT)); } } // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask) if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) { // Shifting in all undef bits? EVT SmallVT = N0.getOperand(0).getValueType(); unsigned BitSize = SmallVT.getScalarSizeInBits(); if (N1C->getZExtValue() >= BitSize) return DAG.getUNDEF(VT); if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) { uint64_t ShiftAmt = N1C->getZExtValue(); SDValue SmallShift = DAG.getNode(ISD::SRL, SDLoc(N0), SmallVT, N0.getOperand(0), DAG.getConstant(ShiftAmt, getShiftAmountTy(SmallVT))); AddToWorklist(SmallShift.getNode()); APInt Mask = APInt::getAllOnesValue(OpSizeInBits).lshr(ShiftAmt); return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, SmallShift), DAG.getConstant(Mask, VT)); } } // fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign // bit, which is unmodified by sra. if (N1C && N1C->getZExtValue() + 1 == OpSizeInBits) { if (N0.getOpcode() == ISD::SRA) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1); } // fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit). if (N1C && N0.getOpcode() == ISD::CTLZ && N1C->getAPIntValue() == Log2_32(OpSizeInBits)) { APInt KnownZero, KnownOne; DAG.computeKnownBits(N0.getOperand(0), KnownZero, KnownOne); // If any of the input bits are KnownOne, then the input couldn't be all // zeros, thus the result of the srl will always be zero. if (KnownOne.getBoolValue()) return DAG.getConstant(0, VT); // If all of the bits input the to ctlz node are known to be zero, then // the result of the ctlz is "32" and the result of the shift is one. APInt UnknownBits = ~KnownZero; if (UnknownBits == 0) return DAG.getConstant(1, VT); // Otherwise, check to see if there is exactly one bit input to the ctlz. if ((UnknownBits & (UnknownBits - 1)) == 0) { // Okay, we know that only that the single bit specified by UnknownBits // could be set on input to the CTLZ node. If this bit is set, the SRL // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair // to an SRL/XOR pair, which is likely to simplify more. unsigned ShAmt = UnknownBits.countTrailingZeros(); SDValue Op = N0.getOperand(0); if (ShAmt) { Op = DAG.getNode(ISD::SRL, SDLoc(N0), VT, Op, DAG.getConstant(ShAmt, getShiftAmountTy(Op.getValueType()))); AddToWorklist(Op.getNode()); } return DAG.getNode(ISD::XOR, SDLoc(N), VT, Op, DAG.getConstant(1, VT)); } } // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()); if (NewOp1.getNode()) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1); } // fold operands of srl based on knowledge that the low bits are not // demanded. if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); if (N1C) { SDValue NewSRL = visitShiftByConstant(N, N1C); if (NewSRL.getNode()) return NewSRL; } // Attempt to convert a srl of a load into a narrower zero-extending load. SDValue NarrowLoad = ReduceLoadWidth(N); if (NarrowLoad.getNode()) return NarrowLoad; // Here is a common situation. We want to optimize: // // %a = ... // %b = and i32 %a, 2 // %c = srl i32 %b, 1 // brcond i32 %c ... // // into // // %a = ... // %b = and %a, 2 // %c = setcc eq %b, 0 // brcond %c ... // // However when after the source operand of SRL is optimized into AND, the SRL // itself may not be optimized further. Look for it and add the BRCOND into // the worklist. if (N->hasOneUse()) { SDNode *Use = *N->use_begin(); if (Use->getOpcode() == ISD::BRCOND) AddToWorklist(Use); else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) { // Also look pass the truncate. Use = *Use->use_begin(); if (Use->getOpcode() == ISD::BRCOND) AddToWorklist(Use); } } return SDValue(); } SDValue DAGCombiner::visitCTLZ(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (ctlz c1) -> c2 if (isa(N0)) return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (ctlz_zero_undef c1) -> c2 if (isa(N0)) return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitCTTZ(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (cttz c1) -> c2 if (isa(N0)) return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (cttz_zero_undef c1) -> c2 if (isa(N0)) return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitCTPOP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (ctpop c1) -> c2 if (isa(N0)) return DAG.getNode(ISD::CTPOP, SDLoc(N), VT, N0); return SDValue(); } /// \brief Generate Min/Max node static SDValue combineMinNumMaxNum(SDLoc DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode CC, const TargetLowering &TLI, SelectionDAG &DAG) { if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) return SDValue(); switch (CC) { case ISD::SETOLT: case ISD::SETOLE: case ISD::SETLT: case ISD::SETLE: case ISD::SETULT: case ISD::SETULE: { unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM; if (TLI.isOperationLegal(Opcode, VT)) return DAG.getNode(Opcode, DL, VT, LHS, RHS); return SDValue(); } case ISD::SETOGT: case ISD::SETOGE: case ISD::SETGT: case ISD::SETGE: case ISD::SETUGT: case ISD::SETUGE: { unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM; if (TLI.isOperationLegal(Opcode, VT)) return DAG.getNode(Opcode, DL, VT, LHS, RHS); return SDValue(); } default: return SDValue(); } } SDValue DAGCombiner::visitSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); ConstantSDNode *N2C = dyn_cast(N2); EVT VT = N->getValueType(0); EVT VT0 = N0.getValueType(); // fold (select C, X, X) -> X if (N1 == N2) return N1; // fold (select true, X, Y) -> X if (N0C && !N0C->isNullValue()) return N1; // fold (select false, X, Y) -> Y if (N0C && N0C->isNullValue()) return N2; // fold (select C, 1, X) -> (or C, X) if (VT == MVT::i1 && N1C && N1C->getAPIntValue() == 1) return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2); // fold (select C, 0, 1) -> (xor C, 1) // We can't do this reliably if integer based booleans have different contents // to floating point based booleans. This is because we can't tell whether we // have an integer-based boolean or a floating-point-based boolean unless we // can find the SETCC that produced it and inspect its operands. This is // fairly easy if C is the SETCC node, but it can potentially be // undiscoverable (or not reasonably discoverable). For example, it could be // in another basic block or it could require searching a complicated // expression. if (VT.isInteger() && (VT0 == MVT::i1 || (VT0.isInteger() && TLI.getBooleanContents(false, false) == TLI.getBooleanContents(false, true) && TLI.getBooleanContents(false, false) == TargetLowering::ZeroOrOneBooleanContent)) && N1C && N2C && N1C->isNullValue() && N2C->getAPIntValue() == 1) { SDValue XORNode; if (VT == VT0) return DAG.getNode(ISD::XOR, SDLoc(N), VT0, N0, DAG.getConstant(1, VT0)); XORNode = DAG.getNode(ISD::XOR, SDLoc(N0), VT0, N0, DAG.getConstant(1, VT0)); AddToWorklist(XORNode.getNode()); if (VT.bitsGT(VT0)) return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, XORNode); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, XORNode); } // fold (select C, 0, X) -> (and (not C), X) if (VT == VT0 && VT == MVT::i1 && N1C && N1C->isNullValue()) { SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT); AddToWorklist(NOTNode.getNode()); return DAG.getNode(ISD::AND, SDLoc(N), VT, NOTNode, N2); } // fold (select C, X, 1) -> (or (not C), X) if (VT == VT0 && VT == MVT::i1 && N2C && N2C->getAPIntValue() == 1) { SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT); AddToWorklist(NOTNode.getNode()); return DAG.getNode(ISD::OR, SDLoc(N), VT, NOTNode, N1); } // fold (select C, X, 0) -> (and C, X) if (VT == MVT::i1 && N2C && N2C->isNullValue()) return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, N1); // fold (select X, X, Y) -> (or X, Y) // fold (select X, 1, Y) -> (or X, Y) if (VT == MVT::i1 && (N0 == N1 || (N1C && N1C->getAPIntValue() == 1))) return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2); // fold (select X, Y, X) -> (and X, Y) // fold (select X, Y, 0) -> (and X, Y) if (VT == MVT::i1 && (N0 == N2 || (N2C && N2C->getAPIntValue() == 0))) return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, N1); // If we can fold this based on the true/false value, do so. if (SimplifySelectOps(N, N1, N2)) return SDValue(N, 0); // Don't revisit N. // fold selects based on a setcc into other things, such as min/max/abs if (N0.getOpcode() == ISD::SETCC) { // select x, y (fcmp lt x, y) -> fminnum x, y // select x, y (fcmp gt x, y) -> fmaxnum x, y // // This is OK if we don't care about what happens if either operand is a // NaN. // // FIXME: Instead of testing for UnsafeFPMath, this should be checking for // no signed zeros as well as no nans. const TargetOptions &Options = DAG.getTarget().Options; if (Options.UnsafeFPMath && VT.isFloatingPoint() && N0.hasOneUse() && DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) { ISD::CondCode CC = cast(N0.getOperand(2))->get(); SDValue FMinMax = combineMinNumMaxNum(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), N1, N2, CC, TLI, DAG); if (FMinMax) return FMinMax; } if ((!LegalOperations && TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) || TLI.isOperationLegal(ISD::SELECT_CC, VT)) return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), N1, N2, N0.getOperand(2)); return SimplifySelect(SDLoc(N), N0, N1, N2); } return SDValue(); } static std::pair SplitVSETCC(const SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); // Split the inputs. SDValue Lo, Hi, LL, LH, RL, RH; std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0); std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2)); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2)); return std::make_pair(Lo, Hi); } // This function assumes all the vselect's arguments are CONCAT_VECTOR // nodes and that the condition is a BV of ConstantSDNodes (or undefs). static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { SDLoc dl(N); SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); EVT VT = N->getValueType(0); int NumElems = VT.getVectorNumElements(); assert(LHS.getOpcode() == ISD::CONCAT_VECTORS && RHS.getOpcode() == ISD::CONCAT_VECTORS && Cond.getOpcode() == ISD::BUILD_VECTOR); // CONCAT_VECTOR can take an arbitrary number of arguments. We only care about // binary ones here. if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2) return SDValue(); // We're sure we have an even number of elements due to the // concat_vectors we have as arguments to vselect. // Skip BV elements until we find one that's not an UNDEF // After we find an UNDEF element, keep looping until we get to half the // length of the BV and see if all the non-undef nodes are the same. ConstantSDNode *BottomHalf = nullptr; for (int i = 0; i < NumElems / 2; ++i) { if (Cond->getOperand(i)->getOpcode() == ISD::UNDEF) continue; if (BottomHalf == nullptr) BottomHalf = cast(Cond.getOperand(i)); else if (Cond->getOperand(i).getNode() != BottomHalf) return SDValue(); } // Do the same for the second half of the BuildVector ConstantSDNode *TopHalf = nullptr; for (int i = NumElems / 2; i < NumElems; ++i) { if (Cond->getOperand(i)->getOpcode() == ISD::UNDEF) continue; if (TopHalf == nullptr) TopHalf = cast(Cond.getOperand(i)); else if (Cond->getOperand(i).getNode() != TopHalf) return SDValue(); } assert(TopHalf && BottomHalf && "One half of the selector was all UNDEFs and the other was all the " "same value. This should have been addressed before this function."); return DAG.getNode( ISD::CONCAT_VECTORS, dl, VT, BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0), TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1)); } SDValue DAGCombiner::visitMSTORE(SDNode *N) { if (Level >= AfterLegalizeTypes) return SDValue(); MaskedStoreSDNode *MST = dyn_cast(N); SDValue Mask = MST->getMask(); - SDValue Data = MST->getData(); + SDValue Data = MST->getValue(); SDLoc DL(N); // If the MSTORE data type requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons // and enables future optimizations (e.g. min/max pattern matching on X86). if (Mask.getOpcode() == ISD::SETCC) { // Check if any splitting is required. if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) != TargetLowering::TypeSplitVector) return SDValue(); SDValue MaskLo, MaskHi, Lo, Hi; std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MST->getValueType(0)); SDValue Chain = MST->getChain(); SDValue Ptr = MST->getBasePtr(); EVT MemoryVT = MST->getMemoryVT(); unsigned Alignment = MST->getOriginalAlignment(); // if Alignment is equal to the vector size, // take the half of it for the second part unsigned SecondHalfAlignment = (Alignment == Data->getValueType(0).getSizeInBits()/8) ? Alignment/2 : Alignment; EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MST->getPointerInfo(), MachineMemOperand::MOStore, LoMemVT.getStoreSize(), Alignment, MST->getAAInfo(), MST->getRanges()); - Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, MMO); + Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO, + MST->isTruncatingStore()); unsigned IncrementSize = LoMemVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, Ptr.getValueType())); MMO = DAG.getMachineFunction(). getMachineMemOperand(MST->getPointerInfo(), MachineMemOperand::MOStore, HiMemVT.getStoreSize(), SecondHalfAlignment, MST->getAAInfo(), MST->getRanges()); - Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, MMO); + Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO, + MST->isTruncatingStore()); AddToWorklist(Lo.getNode()); AddToWorklist(Hi.getNode()); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); } return SDValue(); } SDValue DAGCombiner::visitMLOAD(SDNode *N) { if (Level >= AfterLegalizeTypes) return SDValue(); MaskedLoadSDNode *MLD = dyn_cast(N); SDValue Mask = MLD->getMask(); SDLoc DL(N); // If the MLOAD result requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons // and enables future optimizations (e.g. min/max pattern matching on X86). if (Mask.getOpcode() == ISD::SETCC) { EVT VT = N->getValueType(0); // Check if any splitting is required. if (TLI.getTypeAction(*DAG.getContext(), VT) != TargetLowering::TypeSplitVector) return SDValue(); SDValue MaskLo, MaskHi, Lo, Hi; std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); SDValue Src0 = MLD->getSrc0(); SDValue Src0Lo, Src0Hi; std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0)); SDValue Chain = MLD->getChain(); SDValue Ptr = MLD->getBasePtr(); EVT MemoryVT = MLD->getMemoryVT(); unsigned Alignment = MLD->getOriginalAlignment(); // if Alignment is equal to the vector size, // take the half of it for the second part unsigned SecondHalfAlignment = (Alignment == MLD->getValueType(0).getSizeInBits()/8) ? Alignment/2 : Alignment; EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MLD->getPointerInfo(), MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), Alignment, MLD->getAAInfo(), MLD->getRanges()); - Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, MMO); + Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, LoMemVT, MMO, + ISD::NON_EXTLOAD); unsigned IncrementSize = LoMemVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, Ptr.getValueType())); MMO = DAG.getMachineFunction(). getMachineMemOperand(MLD->getPointerInfo(), MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges()); - Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, MMO); + Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, HiMemVT, MMO, + ISD::NON_EXTLOAD); AddToWorklist(Lo.getNode()); AddToWorklist(Hi.getNode()); // Build a factor node to remember that this load is independent of the // other one. Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), Hi.getValue(1)); // Legalized the chain result - switch anything that used the old chain to // use the new one. DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain); SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); SDValue RetOps[] = { LoadRes, Chain }; return DAG.getMergeValues(RetOps, DL); } return SDValue(); } SDValue DAGCombiner::visitVSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); SDLoc DL(N); // Canonicalize integer abs. // vselect (setg[te] X, 0), X, -X -> // vselect (setgt X, -1), X, -X -> // vselect (setl[te] X, 0), -X, X -> // Y = sra (X, size(X)-1); xor (add (X, Y), Y) if (N0.getOpcode() == ISD::SETCC) { SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1); ISD::CondCode CC = cast(N0.getOperand(2))->get(); bool isAbs = false; bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); if (((RHSIsAllZeros && (CC == ISD::SETGT || CC == ISD::SETGE)) || (ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) && N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1)) isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode()); else if ((RHSIsAllZeros && (CC == ISD::SETLT || CC == ISD::SETLE)) && N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1)) isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode()); if (isAbs) { EVT VT = LHS.getValueType(); SDValue Shift = DAG.getNode( ISD::SRA, DL, VT, LHS, DAG.getConstant(VT.getScalarType().getSizeInBits() - 1, VT)); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift); AddToWorklist(Shift.getNode()); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::XOR, DL, VT, Add, Shift); } } // If the VSELECT result requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons // and enables future optimizations (e.g. min/max pattern matching on X86). if (N0.getOpcode() == ISD::SETCC) { EVT VT = N->getValueType(0); // Check if any splitting is required. if (TLI.getTypeAction(*DAG.getContext(), VT) != TargetLowering::TypeSplitVector) return SDValue(); SDValue Lo, Hi, CCLo, CCHi, LL, LH, RL, RH; std::tie(CCLo, CCHi) = SplitVSETCC(N0.getNode(), DAG); std::tie(LL, LH) = DAG.SplitVectorOperand(N, 1); std::tie(RL, RH) = DAG.SplitVectorOperand(N, 2); Lo = DAG.getNode(N->getOpcode(), DL, LL.getValueType(), CCLo, LL, RL); Hi = DAG.getNode(N->getOpcode(), DL, LH.getValueType(), CCHi, LH, RH); // Add the new VSELECT nodes to the work list in case they need to be split // again. AddToWorklist(Lo.getNode()); AddToWorklist(Hi.getNode()); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } // Fold (vselect (build_vector all_ones), N1, N2) -> N1 if (ISD::isBuildVectorAllOnes(N0.getNode())) return N1; // Fold (vselect (build_vector all_zeros), N1, N2) -> N2 if (ISD::isBuildVectorAllZeros(N0.getNode())) return N2; // The ConvertSelectToConcatVector function is assuming both the above // checks for (vselect (build_vector all{ones,zeros) ...) have been made // and addressed. if (N1.getOpcode() == ISD::CONCAT_VECTORS && N2.getOpcode() == ISD::CONCAT_VECTORS && ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { SDValue CV = ConvertSelectToConcatVector(N, DAG); if (CV.getNode()) return CV; } return SDValue(); } SDValue DAGCombiner::visitSELECT_CC(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); SDValue N3 = N->getOperand(3); SDValue N4 = N->getOperand(4); ISD::CondCode CC = cast(N4)->get(); // fold select_cc lhs, rhs, x, x, cc -> x if (N2 == N3) return N2; // Determine if the condition we're dealing with is constant SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1, CC, SDLoc(N), false); if (SCC.getNode()) { AddToWorklist(SCC.getNode()); if (ConstantSDNode *SCCC = dyn_cast(SCC.getNode())) { if (!SCCC->isNullValue()) return N2; // cond always true -> true val else return N3; // cond always false -> false val } else if (SCC->getOpcode() == ISD::UNDEF) { // When the condition is UNDEF, just return the first operand. This is // coherent the DAG creation, no setcc node is created in this case return N2; } else if (SCC.getOpcode() == ISD::SETCC) { // Fold to a simpler select_cc return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0), SCC.getOperand(1), N2, N3, SCC.getOperand(2)); } } // If we can fold this based on the true/false value, do so. if (SimplifySelectOps(N, N2, N3)) return SDValue(N, 0); // Don't revisit N. // fold select_cc into other things, such as min/max/abs return SimplifySelectCC(SDLoc(N), N0, N1, N2, N3, CC); } SDValue DAGCombiner::visitSETCC(SDNode *N) { return SimplifySetCC(N->getValueType(0), N->getOperand(0), N->getOperand(1), cast(N->getOperand(2))->get(), SDLoc(N)); } // tryToFoldExtendOfConstant - Try to fold a sext/zext/aext // dag node into a ConstantSDNode or a build_vector of constants. // This function is called by the DAGCombiner when visiting sext/zext/aext // dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND). // Vector extends are not folded if operations are legal; this is to // avoid introducing illegal build_vector dag nodes. static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, SelectionDAG &DAG, bool LegalTypes, bool LegalOperations) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ANY_EXTEND) && "Expected EXTEND dag node in input!"); // fold (sext c1) -> c1 // fold (zext c1) -> c1 // fold (aext c1) -> c1 if (isa(N0)) return DAG.getNode(Opcode, SDLoc(N), VT, N0).getNode(); // fold (sext (build_vector AllConstants) -> (build_vector AllConstants) // fold (zext (build_vector AllConstants) -> (build_vector AllConstants) // fold (aext (build_vector AllConstants) -> (build_vector AllConstants) EVT SVT = VT.getScalarType(); if (!(VT.isVector() && (!LegalTypes || (!LegalOperations && TLI.isTypeLegal(SVT))) && ISD::isBuildVectorOfConstantSDNodes(N0.getNode()))) return nullptr; // We can fold this node into a build_vector. unsigned VTBits = SVT.getSizeInBits(); unsigned EVTBits = N0->getValueType(0).getScalarType().getSizeInBits(); unsigned ShAmt = VTBits - EVTBits; SmallVector Elts; unsigned NumElts = N0->getNumOperands(); SDLoc DL(N); for (unsigned i=0; i != NumElts; ++i) { SDValue Op = N0->getOperand(i); if (Op->getOpcode() == ISD::UNDEF) { Elts.push_back(DAG.getUNDEF(SVT)); continue; } ConstantSDNode *CurrentND = cast(Op); const APInt &C = APInt(VTBits, CurrentND->getAPIntValue().getZExtValue()); if (Opcode == ISD::SIGN_EXTEND) Elts.push_back(DAG.getConstant(C.shl(ShAmt).ashr(ShAmt).getZExtValue(), SVT)); else Elts.push_back(DAG.getConstant(C.shl(ShAmt).lshr(ShAmt).getZExtValue(), SVT)); } return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Elts).getNode(); } // ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this: // "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))" // transformation. Returns true if extension are possible and the above // mentioned transformation is profitable. static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0, unsigned ExtOpc, SmallVectorImpl &ExtendNodes, const TargetLowering &TLI) { bool HasCopyToRegUses = false; bool isTruncFree = TLI.isTruncateFree(N->getValueType(0), N0.getValueType()); for (SDNode::use_iterator UI = N0.getNode()->use_begin(), UE = N0.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User == N) continue; if (UI.getUse().getResNo() != N0.getResNo()) continue; // FIXME: Only extend SETCC N, N and SETCC N, c for now. if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) { ISD::CondCode CC = cast(User->getOperand(2))->get(); if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC)) // Sign bits will be lost after a zext. return false; bool Add = false; for (unsigned i = 0; i != 2; ++i) { SDValue UseOp = User->getOperand(i); if (UseOp == N0) continue; if (!isa(UseOp)) return false; Add = true; } if (Add) ExtendNodes.push_back(User); continue; } // If truncates aren't free and there are users we can't // extend, it isn't worthwhile. if (!isTruncFree) return false; // Remember if this value is live-out. if (User->getOpcode() == ISD::CopyToReg) HasCopyToRegUses = true; } if (HasCopyToRegUses) { bool BothLiveOut = false; for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { SDUse &Use = UI.getUse(); if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) { BothLiveOut = true; break; } } if (BothLiveOut) // Both unextended and extended values are live out. There had better be // a good reason for the transformation. return ExtendNodes.size(); } return true; } void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl &SetCCs, SDValue Trunc, SDValue ExtLoad, SDLoc DL, ISD::NodeType ExtType) { // Extend SetCC uses if necessary. for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) { SDNode *SetCC = SetCCs[i]; SmallVector Ops; for (unsigned j = 0; j != 2; ++j) { SDValue SOp = SetCC->getOperand(j); if (SOp == Trunc) Ops.push_back(ExtLoad); else Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp)); } Ops.push_back(SetCC->getOperand(2)); CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops)); } } SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) return SDValue(Res, 0); // fold (sext (sext x)) -> (sext x) // fold (sext (aext x)) -> (sext x) if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N0.getOperand(0)); if (N0.getOpcode() == ISD::TRUNCATE) { // fold (sext (truncate (load x))) -> (sext (smaller load x)) // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n))) SDValue NarrowLoad = ReduceLoadWidth(N0.getNode()); if (NarrowLoad.getNode()) { SDNode* oye = N0.getNode()->getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. AddToWorklist(oye); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } // See if the value being truncated is already sign extended. If so, just // eliminate the trunc/sext pair. SDValue Op = N0.getOperand(0); unsigned OpBits = Op.getValueType().getScalarType().getSizeInBits(); unsigned MidBits = N0.getValueType().getScalarType().getSizeInBits(); unsigned DestBits = VT.getScalarType().getSizeInBits(); unsigned NumSignBits = DAG.ComputeNumSignBits(Op); if (OpBits == DestBits) { // Op is i32, Mid is i8, and Dest is i32. If Op has more than 24 sign // bits, it is already ready. if (NumSignBits > DestBits-MidBits) return Op; } else if (OpBits < DestBits) { // Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign // bits, just sext from i32. if (NumSignBits > OpBits-MidBits) return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, Op); } else { // Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign // bits, just truncate to i32. if (NumSignBits > OpBits-MidBits) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); } // fold (sext (truncate x)) -> (sextinreg x). if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, N0.getValueType())) { if (OpBits < DestBits) Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op); else if (OpBits > DestBits) Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op); return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, Op, DAG.getValueType(N0.getValueType())); } } // fold (sext (load x)) -> (sext (truncate (sextload x))) // None of the supported targets knows how to perform load and sign extend // on vectors in one instruction. We only perform this transformation on // scalars. if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && ISD::isUNINDEXEDLoad(N0.getNode()) && ((!LegalOperations && !cast(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()))) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI); if (DoXform) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); CombineTo(N, ExtLoad); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::SIGN_EXTEND); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (sext (sextload x)) -> (sext (truncate (sextload x))) // fold (sext ( extload x)) -> (sext (truncate (sextload x))) if ((ISD::isSEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) { LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) { SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad), ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (sext (and/or/xor (load x), cst)) -> // (and/or/xor (sextload x), (sext cst)) if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::XOR) && isa(N0.getOperand(0)) && N0.getOperand(1).getOpcode() == ISD::Constant && TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()) && (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { LoadSDNode *LN0 = cast(N0.getOperand(0)); if (LN0->getExtensionType() != ISD::ZEXTLOAD && LN0->isUnindexed()) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::SIGN_EXTEND, SetCCs, TLI); if (DoXform) { SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), LN0->getMemoryVT(), LN0->getMemOperand()); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.sext(VT.getSizeInBits()); SDValue And = DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ExtLoad, DAG.getConstant(Mask, VT)); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); CombineTo(N, And); CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::SIGN_EXTEND); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } } if (N0.getOpcode() == ISD::SETCC) { EVT N0VT = N0.getOperand(0).getValueType(); // sext(setcc) -> sext_in_reg(vsetcc) for vectors. // Only do this before legalize for now. if (VT.isVector() && !LegalOperations && TLI.getBooleanContents(N0VT) == TargetLowering::ZeroOrNegativeOneBooleanContent) { // On some architectures (such as SSE/NEON/etc) the SETCC result type is // of the same size as the compared operands. Only optimize sext(setcc()) // if this is the case. EVT SVT = getSetCCResultType(N0VT); // We know that the # elements of the results is the same as the // # elements of the compare (and the # elements of the compare result // for that matter). Check to see that they are the same size. If so, // we know that the element size of the sext'd result matches the // element size of the compare operands. if (VT.getSizeInBits() == SVT.getSizeInBits()) return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then // truncate/sign extend EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger(); if (SVT == MatchingVectorType) { SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); return DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT); } } // sext(setcc x, y, cc) -> (select (setcc x, y, cc), -1, 0) unsigned ElementWidth = VT.getScalarType().getSizeInBits(); SDValue NegOne = DAG.getConstant(APInt::getAllOnesValue(ElementWidth), VT); SDValue SCC = SimplifySelectCC(SDLoc(N), N0.getOperand(0), N0.getOperand(1), NegOne, DAG.getConstant(0, VT), cast(N0.getOperand(2))->get(), true); if (SCC.getNode()) return SCC; if (!VT.isVector()) { EVT SetCCVT = getSetCCResultType(N0.getOperand(0).getValueType()); if (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, SetCCVT)) { SDLoc DL(N); ISD::CondCode CC = cast(N0.getOperand(2))->get(); SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N0.getOperand(0), N0.getOperand(1), CC); return DAG.getSelect(DL, VT, SetCC, NegOne, DAG.getConstant(0, VT)); } } } // fold (sext x) -> (zext x) if the sign bit is known zero. if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) && DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0); return SDValue(); } // isTruncateOf - If N is a truncate of some other value, return true, record // the value being truncated in Op and which of Op's bits are zero in KnownZero. // This function computes KnownZero to avoid a duplicated call to // computeKnownBits in the caller. static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, APInt &KnownZero) { APInt KnownOne; if (N->getOpcode() == ISD::TRUNCATE) { Op = N->getOperand(0); DAG.computeKnownBits(Op, KnownZero, KnownOne); return true; } if (N->getOpcode() != ISD::SETCC || N->getValueType(0) != MVT::i1 || cast(N->getOperand(2))->get() != ISD::SETNE) return false; SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); assert(Op0.getValueType() == Op1.getValueType()); ConstantSDNode *COp0 = dyn_cast(Op0); ConstantSDNode *COp1 = dyn_cast(Op1); if (COp0 && COp0->isNullValue()) Op = Op1; else if (COp1 && COp1->isNullValue()) Op = Op0; else return false; DAG.computeKnownBits(Op, KnownZero, KnownOne); if (!(KnownZero | APInt(Op.getValueSizeInBits(), 1)).isAllOnesValue()) return false; return true; } SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) return SDValue(Res, 0); // fold (zext (zext x)) -> (zext x) // fold (zext (aext x)) -> (zext x) if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0)); // fold (zext (truncate x)) -> (zext x) or // (zext (truncate x)) -> (truncate x) // This is valid when the truncated bits of x are already zero. // FIXME: We should extend this to work for vectors too. SDValue Op; APInt KnownZero; if (!VT.isVector() && isTruncateOf(DAG, N0, Op, KnownZero)) { APInt TruncatedBits = (Op.getValueSizeInBits() == N0.getValueSizeInBits()) ? APInt(Op.getValueSizeInBits(), 0) : APInt::getBitsSet(Op.getValueSizeInBits(), N0.getValueSizeInBits(), std::min(Op.getValueSizeInBits(), VT.getSizeInBits())); if (TruncatedBits == (KnownZero & TruncatedBits)) { if (VT.bitsGT(Op.getValueType())) return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Op); if (VT.bitsLT(Op.getValueType())) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); return Op; } } // fold (zext (truncate (load x))) -> (zext (smaller load x)) // fold (zext (truncate (srl (load x), c))) -> (zext (small load (x+c/n))) if (N0.getOpcode() == ISD::TRUNCATE) { SDValue NarrowLoad = ReduceLoadWidth(N0.getNode()); if (NarrowLoad.getNode()) { SDNode* oye = N0.getNode()->getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. AddToWorklist(oye); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (zext (truncate x)) -> (and x, mask) if (N0.getOpcode() == ISD::TRUNCATE && (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT))) { // fold (zext (truncate (load x))) -> (zext (smaller load x)) // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n))) SDValue NarrowLoad = ReduceLoadWidth(N0.getNode()); if (NarrowLoad.getNode()) { SDNode* oye = N0.getNode()->getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. AddToWorklist(oye); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } SDValue Op = N0.getOperand(0); if (Op.getValueType().bitsLT(VT)) { Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op); AddToWorklist(Op.getNode()); } else if (Op.getValueType().bitsGT(VT)) { Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); AddToWorklist(Op.getNode()); } return DAG.getZeroExtendInReg(Op, SDLoc(N), N0.getValueType().getScalarType()); } // Fold (zext (and (trunc x), cst)) -> (and x, cst), // if either of the casts is not free. if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::TRUNCATE && N0.getOperand(1).getOpcode() == ISD::Constant && (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(), N0.getValueType()) || !TLI.isZExtFree(N0.getValueType(), VT))) { SDValue X = N0.getOperand(0).getOperand(0); if (X.getValueType().bitsLT(VT)) { X = DAG.getNode(ISD::ANY_EXTEND, SDLoc(X), VT, X); } else if (X.getValueType().bitsGT(VT)) { X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X); } APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); return DAG.getNode(ISD::AND, SDLoc(N), VT, X, DAG.getConstant(Mask, VT)); } // fold (zext (load x)) -> (zext (truncate (zextload x))) // None of the supported targets knows how to perform load and vector_zext // on vectors in one instruction. We only perform this transformation on // scalars. if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && ISD::isUNINDEXEDLoad(N0.getNode()) && ((!LegalOperations && !cast(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()))) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI); if (DoXform) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); CombineTo(N, ExtLoad); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (zext (and/or/xor (load x), cst)) -> // (and/or/xor (zextload x), (zext cst)) if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::XOR) && isa(N0.getOperand(0)) && N0.getOperand(1).getOpcode() == ISD::Constant && TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()) && (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { LoadSDNode *LN0 = cast(N0.getOperand(0)); if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed()) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::ZERO_EXTEND, SetCCs, TLI); if (DoXform) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), LN0->getMemoryVT(), LN0->getMemOperand()); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); SDValue And = DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ExtLoad, DAG.getConstant(Mask, VT)); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); CombineTo(N, And); CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } } // fold (zext (zextload x)) -> (zext (truncate (zextload x))) // fold (zext ( extload x)) -> (zext (truncate (zextload x))) if ((ISD::isZEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) { LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT)) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad), ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } if (N0.getOpcode() == ISD::SETCC) { if (!LegalOperations && VT.isVector() && N0.getValueType().getVectorElementType() == MVT::i1) { EVT N0VT = N0.getOperand(0).getValueType(); if (getSetCCResultType(N0VT) == N0.getValueType()) return SDValue(); // zext(setcc) -> (and (vsetcc), (1, 1, ...) for vectors. // Only do this before legalize for now. EVT EltVT = VT.getVectorElementType(); SmallVector OneOps(VT.getVectorNumElements(), DAG.getConstant(1, EltVT)); if (VT.getSizeInBits() == N0VT.getSizeInBits()) // We know that the # elements of the results is the same as the // # elements of the compare (and the # elements of the compare result // for that matter). Check to see that they are the same size. If so, // we know that the element size of the sext'd result matches the // element size of the compare operands. return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()), DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, OneOps)); // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then // truncate/sign extend EVT MatchingElementType = EVT::getIntegerVT(*DAG.getContext(), N0VT.getScalarType().getSizeInBits()); EVT MatchingVectorType = EVT::getVectorVT(*DAG.getContext(), MatchingElementType, N0VT.getVectorNumElements()); SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT), DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, OneOps)); } // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc SDValue SCC = SimplifySelectCC(SDLoc(N), N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, VT), DAG.getConstant(0, VT), cast(N0.getOperand(2))->get(), true); if (SCC.getNode()) return SCC; } // (zext (shl (zext x), cst)) -> (shl (zext x), cst) if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) && isa(N0.getOperand(1)) && N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse()) { SDValue ShAmt = N0.getOperand(1); unsigned ShAmtVal = cast(ShAmt)->getZExtValue(); if (N0.getOpcode() == ISD::SHL) { SDValue InnerZExt = N0.getOperand(0); // If the original shl may be shifting out bits, do not perform this // transformation. unsigned KnownZeroBits = InnerZExt.getValueType().getSizeInBits() - InnerZExt.getOperand(0).getValueType().getSizeInBits(); if (ShAmtVal > KnownZeroBits) return SDValue(); } SDLoc DL(N); // Ensure that the shift amount is wide enough for the shifted value. if (VT.getSizeInBits() >= 256) ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt); return DAG.getNode(N0.getOpcode(), DL, VT, DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)), ShAmt); } return SDValue(); } SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) return SDValue(Res, 0); // fold (aext (aext x)) -> (aext x) // fold (aext (zext x)) -> (zext x) // fold (aext (sext x)) -> (sext x) if (N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND) return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); // fold (aext (truncate (load x))) -> (aext (smaller load x)) // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n))) if (N0.getOpcode() == ISD::TRUNCATE) { SDValue NarrowLoad = ReduceLoadWidth(N0.getNode()); if (NarrowLoad.getNode()) { SDNode* oye = N0.getNode()->getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. AddToWorklist(oye); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (aext (truncate x)) if (N0.getOpcode() == ISD::TRUNCATE) { SDValue TruncOp = N0.getOperand(0); if (TruncOp.getValueType() == VT) return TruncOp; // x iff x size == zext size. if (TruncOp.getValueType().bitsGT(VT)) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, TruncOp); return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, TruncOp); } // Fold (aext (and (trunc x), cst)) -> (and x, cst) // if the trunc is not free. if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::TRUNCATE && N0.getOperand(1).getOpcode() == ISD::Constant && !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(), N0.getValueType())) { SDValue X = N0.getOperand(0).getOperand(0); if (X.getValueType().bitsLT(VT)) { X = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, X); } else if (X.getValueType().bitsGT(VT)) { X = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, X); } APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); return DAG.getNode(ISD::AND, SDLoc(N), VT, X, DAG.getConstant(Mask, VT)); } // fold (aext (load x)) -> (aext (truncate (extload x))) // None of the supported targets knows how to perform load and any_ext // on vectors in one instruction. We only perform this transformation on // scalars. if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && ISD::isUNINDEXEDLoad(N0.getNode()) && TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ANY_EXTEND, SetCCs, TLI); if (DoXform) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); CombineTo(N, ExtLoad); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ANY_EXTEND); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (aext (zextload x)) -> (aext (truncate (zextload x))) // fold (aext (sextload x)) -> (aext (truncate (sextload x))) // fold (aext ( extload x)) -> (aext (truncate (extload x))) if (N0.getOpcode() == ISD::LOAD && !ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) { LoadSDNode *LN0 = cast(N0); ISD::LoadExtType ExtType = LN0->getExtensionType(); EVT MemVT = LN0->getMemoryVT(); if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) { SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad), ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } if (N0.getOpcode() == ISD::SETCC) { // For vectors: // aext(setcc) -> vsetcc // aext(setcc) -> truncate(vsetcc) // aext(setcc) -> aext(vsetcc) // Only do this before legalize for now. if (VT.isVector() && !LegalOperations) { EVT N0VT = N0.getOperand(0).getValueType(); // We know that the # elements of the results is the same as the // # elements of the compare (and the # elements of the compare result // for that matter). Check to see that they are the same size. If so, // we know that the element size of the sext'd result matches the // element size of the compare operands. if (VT.getSizeInBits() == N0VT.getSizeInBits()) return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then // truncate/any extend else { EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger(); SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT); } } // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc SDValue SCC = SimplifySelectCC(SDLoc(N), N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, VT), DAG.getConstant(0, VT), cast(N0.getOperand(2))->get(), true); if (SCC.getNode()) return SCC; } return SDValue(); } /// See if the specified operand can be simplified with the knowledge that only /// the bits specified by Mask are used. If so, return the simpler operand, /// otherwise return a null SDValue. SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) { switch (V.getOpcode()) { default: break; case ISD::Constant: { const ConstantSDNode *CV = cast(V.getNode()); assert(CV && "Const value should be ConstSDNode."); const APInt &CVal = CV->getAPIntValue(); APInt NewVal = CVal & Mask; if (NewVal != CVal) return DAG.getConstant(NewVal, V.getValueType()); break; } case ISD::OR: case ISD::XOR: // If the LHS or RHS don't contribute bits to the or, drop them. if (DAG.MaskedValueIsZero(V.getOperand(0), Mask)) return V.getOperand(1); if (DAG.MaskedValueIsZero(V.getOperand(1), Mask)) return V.getOperand(0); break; case ISD::SRL: // Only look at single-use SRLs. if (!V.getNode()->hasOneUse()) break; if (ConstantSDNode *RHSC = dyn_cast(V.getOperand(1))) { // See if we can recursively simplify the LHS. unsigned Amt = RHSC->getZExtValue(); // Watch out for shift count overflow though. if (Amt >= Mask.getBitWidth()) break; APInt NewMask = Mask << Amt; SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask); if (SimplifyLHS.getNode()) return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS, V.getOperand(1)); } } return SDValue(); } /// If the result of a wider load is shifted to right of N bits and then /// truncated to a narrower type and where N is a multiple of number of bits of /// the narrower type, transform it to a narrower load from address + N / num of /// bits of new type. If the result is to be extended, also fold the extension /// to form a extending load. SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { unsigned Opc = N->getOpcode(); ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT ExtVT = VT; // This transformation isn't valid for vector loads. if (VT.isVector()) return SDValue(); // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then // extended to VT. if (Opc == ISD::SIGN_EXTEND_INREG) { ExtType = ISD::SEXTLOAD; ExtVT = cast(N->getOperand(1))->getVT(); } else if (Opc == ISD::SRL) { // Another special-case: SRL is basically zero-extending a narrower value. ExtType = ISD::ZEXTLOAD; N0 = SDValue(N, 0); ConstantSDNode *N01 = dyn_cast(N0.getOperand(1)); if (!N01) return SDValue(); ExtVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() - N01->getZExtValue()); } if (LegalOperations && !TLI.isLoadExtLegal(ExtType, VT, ExtVT)) return SDValue(); unsigned EVTBits = ExtVT.getSizeInBits(); // Do not generate loads of non-round integer types since these can // be expensive (and would be wrong if the type is not byte sized). if (!ExtVT.isRound()) return SDValue(); unsigned ShAmt = 0; if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) { if (ConstantSDNode *N01 = dyn_cast(N0.getOperand(1))) { ShAmt = N01->getZExtValue(); // Is the shift amount a multiple of size of VT? if ((ShAmt & (EVTBits-1)) == 0) { N0 = N0.getOperand(0); // Is the load width a multiple of size of VT? if ((N0.getValueType().getSizeInBits() & (EVTBits-1)) != 0) return SDValue(); } // At this point, we must have a load or else we can't do the transform. if (!isa(N0)) return SDValue(); // Because a SRL must be assumed to *need* to zero-extend the high bits // (as opposed to anyext the high bits), we can't combine the zextload // lowering of SRL and an sextload. if (cast(N0)->getExtensionType() == ISD::SEXTLOAD) return SDValue(); // If the shift amount is larger than the input type then we're not // accessing any of the loaded bytes. If the load was a zextload/extload // then the result of the shift+trunc is zero/undef (handled elsewhere). if (ShAmt >= cast(N0)->getMemoryVT().getSizeInBits()) return SDValue(); } } // If the load is shifted left (and the result isn't shifted back right), // we can fold the truncate through the shift. unsigned ShLeftAmt = 0; if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() && ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) { if (ConstantSDNode *N01 = dyn_cast(N0.getOperand(1))) { ShLeftAmt = N01->getZExtValue(); N0 = N0.getOperand(0); } } // If we haven't found a load, we can't narrow it. Don't transform one with // multiple uses, this would require adding a new load. if (!isa(N0) || !N0.hasOneUse()) return SDValue(); // Don't change the width of a volatile load. LoadSDNode *LN0 = cast(N0); if (LN0->isVolatile()) return SDValue(); // Verify that we are actually reducing a load width here. if (LN0->getMemoryVT().getSizeInBits() < EVTBits) return SDValue(); // For the transform to be legal, the load must produce only two values // (the value loaded and the chain). Don't transform a pre-increment // load, for example, which produces an extra value. Otherwise the // transformation is not equivalent, and the downstream logic to replace // uses gets things wrong. if (LN0->getNumValues() > 2) return SDValue(); // If the load that we're shrinking is an extload and we're not just // discarding the extension we can't simply shrink the load. Bail. // TODO: It would be possible to merge the extensions in some cases. if (LN0->getExtensionType() != ISD::NON_EXTLOAD && LN0->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt) return SDValue(); if (!TLI.shouldReduceLoadWidth(LN0, ExtType, ExtVT)) return SDValue(); EVT PtrType = N0.getOperand(1).getValueType(); if (PtrType == MVT::Untyped || PtrType.isExtended()) // It's not possible to generate a constant of extended or untyped type. return SDValue(); // For big endian targets, we need to adjust the offset to the pointer to // load the correct bytes. if (TLI.isBigEndian()) { unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits(); unsigned EVTStoreBits = ExtVT.getStoreSizeInBits(); ShAmt = LVTStoreBits - EVTStoreBits - ShAmt; } uint64_t PtrOff = ShAmt / 8; unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff); SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(LN0), PtrType, LN0->getBasePtr(), DAG.getConstant(PtrOff, PtrType)); AddToWorklist(NewPtr.getNode()); SDValue Load; if (ExtType == ISD::NON_EXTLOAD) Load = DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(), NewAlign, LN0->getAAInfo()); else Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(),NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT, LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(), NewAlign, LN0->getAAInfo()); // Replace the old load's chain with the new load's chain. WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); // Shift the result left, if we've swallowed a left shift. SDValue Result = Load; if (ShLeftAmt != 0) { EVT ShImmTy = getShiftAmountTy(Result.getValueType()); if (!isUIntN(ShImmTy.getSizeInBits(), ShLeftAmt)) ShImmTy = VT; // If the shift amount is as large as the result size (but, presumably, // no larger than the source) then the useful bits of the result are // zero; we can't simply return the shortened shift, because the result // of that operation is undefined. if (ShLeftAmt >= VT.getSizeInBits()) Result = DAG.getConstant(0, VT); else Result = DAG.getNode(ISD::SHL, SDLoc(N0), VT, Result, DAG.getConstant(ShLeftAmt, ShImmTy)); } // Return the new loaded value. return Result; } SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); EVT EVT = cast(N1)->getVT(); unsigned VTBits = VT.getScalarType().getSizeInBits(); unsigned EVTBits = EVT.getScalarType().getSizeInBits(); // fold (sext_in_reg c1) -> c1 if (isa(N0) || N0.getOpcode() == ISD::UNDEF) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1); // If the input is already sign extended, just drop the extension. if (DAG.ComputeNumSignBits(N0) >= VTBits-EVTBits+1) return N0; // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2 if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG && EVT.bitsLT(cast(N0.getOperand(1))->getVT())) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0), N1); // fold (sext_in_reg (sext x)) -> (sext x) // fold (sext_in_reg (aext x)) -> (sext x) // if x is small enough. if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) { SDValue N00 = N0.getOperand(0); if (N00.getValueType().getScalarType().getSizeInBits() <= EVTBits && (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1); } // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero. if (DAG.MaskedValueIsZero(N0, APInt::getBitsSet(VTBits, EVTBits-1, EVTBits))) return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT); // fold operands of sext_in_reg based on knowledge that the top bits are not // demanded. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // fold (sext_in_reg (load x)) -> (smaller sextload x) // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits)) SDValue NarrowLoad = ReduceLoadWidth(N); if (NarrowLoad.getNode()) return NarrowLoad; // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24) // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible. // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above. if (N0.getOpcode() == ISD::SRL) { if (ConstantSDNode *ShAmt = dyn_cast(N0.getOperand(1))) if (ShAmt->getZExtValue()+EVTBits <= VTBits) { // We can turn this into an SRA iff the input to the SRL is already sign // extended enough. unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0)); if (VTBits-(ShAmt->getZExtValue()+EVTBits) < InSignBits) return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1)); } } // fold (sext_inreg (extload x)) -> (sextload x) if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && EVT == cast(N0)->getMemoryVT() && ((!LegalOperations && !cast(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), EVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); AddToWorklist(ExtLoad.getNode()); return SDValue(N, 0); // Return N so it doesn't get rechecked! } // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse() && EVT == cast(N0)->getMemoryVT() && ((!LegalOperations && !cast(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), EVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16)) if (EVTBits <= 16 && N0.getOpcode() == ISD::OR) { SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), N0.getOperand(1), false); if (BSwap.getNode()) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1); } // Fold a sext_inreg of a build_vector of ConstantSDNodes or undefs // into a build_vector. if (ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { SmallVector Elts; unsigned NumElts = N0->getNumOperands(); unsigned ShAmt = VTBits - EVTBits; for (unsigned i = 0; i != NumElts; ++i) { SDValue Op = N0->getOperand(i); if (Op->getOpcode() == ISD::UNDEF) { Elts.push_back(Op); continue; } ConstantSDNode *CurrentND = cast(Op); const APInt &C = APInt(VTBits, CurrentND->getAPIntValue().getZExtValue()); Elts.push_back(DAG.getConstant(C.shl(ShAmt).ashr(ShAmt).getZExtValue(), Op.getValueType())); } return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Elts); } return SDValue(); } SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); bool isLE = TLI.isLittleEndian(); // noop truncate if (N0.getValueType() == N->getValueType(0)) return N0; // fold (truncate c1) -> c1 if (isa(N0)) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0); // fold (truncate (truncate x)) -> (truncate x) if (N0.getOpcode() == ISD::TRUNCATE) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0)); // fold (truncate (ext x)) -> (ext x) or (truncate x) or x if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) { if (N0.getOperand(0).getValueType().bitsLT(VT)) // if the source is smaller than the dest, we still need an extend return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); if (N0.getOperand(0).getValueType().bitsGT(VT)) // if the source is larger than the dest, than we just need the truncate return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0)); // if the source and dest are the same type, we can drop both the extend // and the truncate. return N0.getOperand(0); } // Fold extract-and-trunc into a narrow extract. For example: // i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1) // i32 y = TRUNCATE(i64 x) // -- becomes -- // v16i8 b = BITCAST (v2i64 val) // i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8) // // Note: We only run this optimization after type legalization (which often // creates this pattern) and before operation legalization after which // we need to be more careful about the vector instructions that we generate. if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) { EVT VecTy = N0.getOperand(0).getValueType(); EVT ExTy = N0.getValueType(); EVT TrTy = N->getValueType(0); unsigned NumElem = VecTy.getVectorNumElements(); unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, SizeRatio * NumElem); assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size"); SDValue EltNo = N0->getOperand(1); if (isa(EltNo) && isTypeLegal(NVT)) { int Elt = cast(EltNo)->getZExtValue(); EVT IndexTy = TLI.getVectorIdxTy(); int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1)); SDValue V = DAG.getNode(ISD::BITCAST, SDLoc(N), NVT, N0.getOperand(0)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), TrTy, V, DAG.getConstant(Index, IndexTy)); } } // trunc (select c, a, b) -> select c, (trunc a), (trunc b) if (N0.getOpcode() == ISD::SELECT) { EVT SrcVT = N0.getValueType(); if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) && TLI.isTruncateFree(SrcVT, VT)) { SDLoc SL(N0); SDValue Cond = N0.getOperand(0); SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2)); return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1); } } // Fold a series of buildvector, bitcast, and truncate if possible. // For example fold // (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to // (2xi32 (buildvector x, y)). if (Level == AfterLegalizeVectorOps && VT.isVector() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && N0.getOperand(0).hasOneUse()) { SDValue BuildVect = N0.getOperand(0); EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType(); EVT TruncVecEltTy = VT.getVectorElementType(); // Check that the element types match. if (BuildVectEltTy == TruncVecEltTy) { // Now we only need to compute the offset of the truncated elements. unsigned BuildVecNumElts = BuildVect.getNumOperands(); unsigned TruncVecNumElts = VT.getVectorNumElements(); unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts; assert((BuildVecNumElts % TruncVecNumElts) == 0 && "Invalid number of elements"); SmallVector Opnds; for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset) Opnds.push_back(BuildVect.getOperand(i)); return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Opnds); } } // See if we can simplify the input to this truncate through knowledge that // only the low bits are being used. // For example "trunc (or (shl x, 8), y)" // -> trunc y // Currently we only perform this optimization on scalars because vectors // may have different active low bits. if (!VT.isVector()) { SDValue Shorter = GetDemandedBits(N0, APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits())); if (Shorter.getNode()) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter); } // fold (truncate (load x)) -> (smaller load x) // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits)) if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) { SDValue Reduced = ReduceLoadWidth(N); if (Reduced.getNode()) return Reduced; // Handle the case where the load remains an extending load even // after truncation. if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) { LoadSDNode *LN0 = cast(N0); if (!LN0->isVolatile() && LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) { SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), LN0->getMemoryVT(), LN0->getMemOperand()); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1)); return NewLoad; } } } // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)), // where ... are all 'undef'. if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) { SmallVector VTs; SDValue V; unsigned Idx = 0; unsigned NumDefs = 0; for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) { SDValue X = N0.getOperand(i); if (X.getOpcode() != ISD::UNDEF) { V = X; Idx = i; NumDefs++; } // Stop if more than one members are non-undef. if (NumDefs > 1) break; VTs.push_back(EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), X.getValueType().getVectorNumElements())); } if (NumDefs == 0) return DAG.getUNDEF(VT); if (NumDefs == 1) { assert(V.getNode() && "The single defined operand is empty!"); SmallVector Opnds; for (unsigned i = 0, e = VTs.size(); i != e; ++i) { if (i != Idx) { Opnds.push_back(DAG.getUNDEF(VTs[i])); continue; } SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V); AddToWorklist(NV.getNode()); Opnds.push_back(NV); } return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds); } } // Simplify the operands using demanded-bits information. if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); return SDValue(); } static SDNode *getBuildPairElt(SDNode *N, unsigned i) { SDValue Elt = N->getOperand(i); if (Elt.getOpcode() != ISD::MERGE_VALUES) return Elt.getNode(); return Elt.getOperand(Elt.getResNo()).getNode(); } /// build_pair (load, load) -> load /// if load locations are consecutive. SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) { assert(N->getOpcode() == ISD::BUILD_PAIR); LoadSDNode *LD1 = dyn_cast(getBuildPairElt(N, 0)); LoadSDNode *LD2 = dyn_cast(getBuildPairElt(N, 1)); if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() || LD1->getAddressSpace() != LD2->getAddressSpace()) return SDValue(); EVT LD1VT = LD1->getValueType(0); if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() && // If both are volatile this would reduce the number of volatile loads. // If one is volatile it might be ok, but play conservative and bail out. !LD1->isVolatile() && !LD2->isVolatile() && DAG.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1)) { unsigned Align = LD1->getAlignment(); unsigned NewAlign = TLI.getDataLayout()-> getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); if (NewAlign <= Align && (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(), LD1->getPointerInfo(), false, false, false, Align); } return SDValue(); } SDValue DAGCombiner::visitBITCAST(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // If the input is a BUILD_VECTOR with all constant elements, fold this now. // Only do this before legalize, since afterward the target may be depending // on the bitconvert. // First check to see if this is all constant. if (!LegalTypes && N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() && VT.isVector()) { bool isSimple = cast(N0)->isConstant(); EVT DestEltVT = N->getValueType(0).getVectorElementType(); assert(!DestEltVT.isVector() && "Element type of vector ValueType must not be vector!"); if (isSimple) return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(), DestEltVT); } // If the input is a constant, let getNode fold it. if (isa(N0) || isa(N0)) { // If we can't allow illegal operations, we need to check that this is just // a fp -> int or int -> conversion and that the resulting operation will // be legal. if (!LegalOperations || (isa(N0) && VT.isFloatingPoint() && !VT.isVector() && TLI.isOperationLegal(ISD::ConstantFP, VT)) || (isa(N0) && VT.isInteger() && !VT.isVector() && TLI.isOperationLegal(ISD::Constant, VT))) return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, N0); } // (conv (conv x, t1), t2) -> (conv x, t2) if (N0.getOpcode() == ISD::BITCAST) return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, N0.getOperand(0)); // fold (conv (load x)) -> (load (conv*)x) // If the resultant load doesn't need a higher alignment than the original! if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. !cast(N0)->isVolatile() && // Do not remove the cast if the types differ in endian layout. TLI.hasBigEndianPartOrdering(N0.getValueType()) == TLI.hasBigEndianPartOrdering(VT) && (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) && TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) { LoadSDNode *LN0 = cast(N0); unsigned Align = TLI.getDataLayout()-> getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); unsigned OrigAlign = LN0->getAlignment(); if (Align <= OrigAlign) { SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), LN0->getPointerInfo(), LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(), OrigAlign, LN0->getAAInfo()); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); return Load; } } // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit) // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit)) // This often reduces constant pool loads. if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) || (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) && N0.getNode()->hasOneUse() && VT.isInteger() && !VT.isVector() && !N0.getValueType().isVector()) { SDValue NewConv = DAG.getNode(ISD::BITCAST, SDLoc(N0), VT, N0.getOperand(0)); AddToWorklist(NewConv.getNode()); APInt SignBit = APInt::getSignBit(VT.getSizeInBits()); if (N0.getOpcode() == ISD::FNEG) return DAG.getNode(ISD::XOR, SDLoc(N), VT, NewConv, DAG.getConstant(SignBit, VT)); assert(N0.getOpcode() == ISD::FABS); return DAG.getNode(ISD::AND, SDLoc(N), VT, NewConv, DAG.getConstant(~SignBit, VT)); } // fold (bitconvert (fcopysign cst, x)) -> // (or (and (bitconvert x), sign), (and cst, (not sign))) // Note that we don't handle (copysign x, cst) because this can always be // folded to an fneg or fabs. if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() && isa(N0.getOperand(0)) && VT.isInteger() && !VT.isVector()) { unsigned OrigXWidth = N0.getOperand(1).getValueType().getSizeInBits(); EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth); if (isTypeLegal(IntXVT)) { SDValue X = DAG.getNode(ISD::BITCAST, SDLoc(N0), IntXVT, N0.getOperand(1)); AddToWorklist(X.getNode()); // If X has a different width than the result/lhs, sext it or truncate it. unsigned VTWidth = VT.getSizeInBits(); if (OrigXWidth < VTWidth) { X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X); AddToWorklist(X.getNode()); } else if (OrigXWidth > VTWidth) { // To get the sign bit in the right place, we have to shift it right // before truncating. X = DAG.getNode(ISD::SRL, SDLoc(X), X.getValueType(), X, DAG.getConstant(OrigXWidth-VTWidth, X.getValueType())); AddToWorklist(X.getNode()); X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X); AddToWorklist(X.getNode()); } APInt SignBit = APInt::getSignBit(VT.getSizeInBits()); X = DAG.getNode(ISD::AND, SDLoc(X), VT, X, DAG.getConstant(SignBit, VT)); AddToWorklist(X.getNode()); SDValue Cst = DAG.getNode(ISD::BITCAST, SDLoc(N0), VT, N0.getOperand(0)); Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT, Cst, DAG.getConstant(~SignBit, VT)); AddToWorklist(Cst.getNode()); return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst); } } // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive. if (N0.getOpcode() == ISD::BUILD_PAIR) { SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT); if (CombineLD.getNode()) return CombineLD; } return SDValue(); } SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) { EVT VT = N->getValueType(0); return CombineConsecutiveLoads(N, VT); } /// We know that BV is a build_vector node with Constant, ConstantFP or Undef /// operands. DstEltVT indicates the destination element value type. SDValue DAGCombiner:: ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { EVT SrcEltVT = BV->getValueType(0).getVectorElementType(); // If this is already the right type, we're done. if (SrcEltVT == DstEltVT) return SDValue(BV, 0); unsigned SrcBitSize = SrcEltVT.getSizeInBits(); unsigned DstBitSize = DstEltVT.getSizeInBits(); // If this is a conversion of N elements of one type to N elements of another // type, convert each element. This handles FP<->INT cases. if (SrcBitSize == DstBitSize) { EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, BV->getValueType(0).getVectorNumElements()); // Due to the FP element handling below calling this routine recursively, // we can end up with a scalar-to-vector node here. if (BV->getOpcode() == ISD::SCALAR_TO_VECTOR) return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(BV), VT, DAG.getNode(ISD::BITCAST, SDLoc(BV), DstEltVT, BV->getOperand(0))); SmallVector Ops; for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) { SDValue Op = BV->getOperand(i); // If the vector element type is not legal, the BUILD_VECTOR operands // are promoted and implicitly truncated. Make that explicit here. if (Op.getValueType() != SrcEltVT) Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op); Ops.push_back(DAG.getNode(ISD::BITCAST, SDLoc(BV), DstEltVT, Op)); AddToWorklist(Ops.back().getNode()); } return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, Ops); } // Otherwise, we're growing or shrinking the elements. To avoid having to // handle annoying details of growing/shrinking FP values, we convert them to // int first. if (SrcEltVT.isFloatingPoint()) { // Convert the input float vector to a int vector where the elements are the // same sizes. EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits()); BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode(); SrcEltVT = IntVT; } // Now we know the input is an integer vector. If the output is a FP type, // convert to integer first, then to FP of the right size. if (DstEltVT.isFloatingPoint()) { EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits()); SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode(); // Next, convert to FP elements of the same size. return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT); } // Okay, we know the src/dst types are both integers of differing types. // Handling growing first. assert(SrcEltVT.isInteger() && DstEltVT.isInteger()); if (SrcBitSize < DstBitSize) { unsigned NumInputsPerOutput = DstBitSize/SrcBitSize; SmallVector Ops; for (unsigned i = 0, e = BV->getNumOperands(); i != e; i += NumInputsPerOutput) { bool isLE = TLI.isLittleEndian(); APInt NewBits = APInt(DstBitSize, 0); bool EltIsUndef = true; for (unsigned j = 0; j != NumInputsPerOutput; ++j) { // Shift the previously computed bits over. NewBits <<= SrcBitSize; SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j)); if (Op.getOpcode() == ISD::UNDEF) continue; EltIsUndef = false; NewBits |= cast(Op)->getAPIntValue(). zextOrTrunc(SrcBitSize).zext(DstBitSize); } if (EltIsUndef) Ops.push_back(DAG.getUNDEF(DstEltVT)); else Ops.push_back(DAG.getConstant(NewBits, DstEltVT)); } EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size()); return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, Ops); } // Finally, this must be the case where we are shrinking elements: each input // turns into multiple outputs. bool isS2V = ISD::isScalarToVector(BV); unsigned NumOutputsPerInput = SrcBitSize/DstBitSize; EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, NumOutputsPerInput*BV->getNumOperands()); SmallVector Ops; for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) { if (BV->getOperand(i).getOpcode() == ISD::UNDEF) { for (unsigned j = 0; j != NumOutputsPerInput; ++j) Ops.push_back(DAG.getUNDEF(DstEltVT)); continue; } APInt OpVal = cast(BV->getOperand(i))-> getAPIntValue().zextOrTrunc(SrcBitSize); for (unsigned j = 0; j != NumOutputsPerInput; ++j) { APInt ThisVal = OpVal.trunc(DstBitSize); Ops.push_back(DAG.getConstant(ThisVal, DstEltVT)); if (isS2V && i == 0 && j == 0 && ThisVal.zext(SrcBitSize) == OpVal) // Simply turn this into a SCALAR_TO_VECTOR of the new type. return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(BV), VT, Ops[0]); OpVal = OpVal.lshr(DstBitSize); } // For big endian targets, swap the order of the pieces of each element. if (TLI.isBigEndian()) std::reverse(Ops.end()-NumOutputsPerInput, Ops.end()); } return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, Ops); } SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); const TargetOptions &Options = DAG.getTarget().Options; // fold vector ops if (VT.isVector()) { SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; } // fold (fadd c1, c2) -> c1 + c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N1); // canonicalize constant to RHS if (N0CFP && !N1CFP) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N0); // fold (fadd A, (fneg B)) -> (fsub A, B) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2) return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0, GetNegatedExpression(N1, DAG, LegalOperations)); // fold (fadd (fneg A), B) -> (fsub B, A) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && isNegatibleForFree(N0, LegalOperations, TLI, &Options) == 2) return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N1, GetNegatedExpression(N0, DAG, LegalOperations)); // If 'unsafe math' is enabled, fold lots of things. if (Options.UnsafeFPMath) { // No FP constant should be created after legalization as Instruction // Selection pass has a hard time dealing with FP constants. bool AllowNewConst = (Level < AfterLegalizeDAG); // fold (fadd A, 0) -> A if (N1CFP && N1CFP->getValueAPF().isZero()) return N0; // fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2)) if (N1CFP && N0.getOpcode() == ISD::FADD && N0.getNode()->hasOneUse() && isa(N0.getOperand(1))) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0.getOperand(0), DAG.getNode(ISD::FADD, SDLoc(N), VT, N0.getOperand(1), N1)); // If allowed, fold (fadd (fneg x), x) -> 0.0 if (AllowNewConst && N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1) return DAG.getConstantFP(0.0, VT); // If allowed, fold (fadd x, (fneg x)) -> 0.0 if (AllowNewConst && N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0) return DAG.getConstantFP(0.0, VT); // We can fold chains of FADD's of the same value into multiplications. // This transform is not safe in general because we are reducing the number // of rounding steps. if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) { if (N0.getOpcode() == ISD::FMUL) { ConstantFPSDNode *CFP00 = dyn_cast(N0.getOperand(0)); ConstantFPSDNode *CFP01 = dyn_cast(N0.getOperand(1)); // (fadd (fmul x, c), x) -> (fmul x, c+1) if (CFP01 && !CFP00 && N0.getOperand(0) == N1) { SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, SDValue(CFP01, 0), DAG.getConstantFP(1.0, VT)); return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N1, NewCFP); } // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2) if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD && N1.getOperand(0) == N1.getOperand(1) && N0.getOperand(0) == N1.getOperand(0)) { SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, SDValue(CFP01, 0), DAG.getConstantFP(2.0, VT)); return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), NewCFP); } } if (N1.getOpcode() == ISD::FMUL) { ConstantFPSDNode *CFP10 = dyn_cast(N1.getOperand(0)); ConstantFPSDNode *CFP11 = dyn_cast(N1.getOperand(1)); // (fadd x, (fmul x, c)) -> (fmul x, c+1) if (CFP11 && !CFP10 && N1.getOperand(0) == N0) { SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, SDValue(CFP11, 0), DAG.getConstantFP(1.0, VT)); return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0, NewCFP); } // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2) if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1) && N1.getOperand(0) == N0.getOperand(0)) { SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, SDValue(CFP11, 0), DAG.getConstantFP(2.0, VT)); return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N1.getOperand(0), NewCFP); } } if (N0.getOpcode() == ISD::FADD && AllowNewConst) { ConstantFPSDNode *CFP = dyn_cast(N0.getOperand(0)); // (fadd (fadd x, x), x) -> (fmul x, 3.0) if (!CFP && N0.getOperand(0) == N0.getOperand(1) && (N0.getOperand(0) == N1)) return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N1, DAG.getConstantFP(3.0, VT)); } if (N1.getOpcode() == ISD::FADD && AllowNewConst) { ConstantFPSDNode *CFP10 = dyn_cast(N1.getOperand(0)); // (fadd x, (fadd x, x)) -> (fmul x, 3.0) if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) && N1.getOperand(0) == N0) return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0, DAG.getConstantFP(3.0, VT)); } // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0) if (AllowNewConst && N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1) && N1.getOperand(0) == N1.getOperand(1) && N0.getOperand(0) == N1.getOperand(0)) return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), DAG.getConstantFP(4.0, VT)); } } // enable-unsafe-fp-math // FADD -> FMA combines: if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) && TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) { // fold (fadd (fmul x, y), z) -> (fma x, y, z) if (N0.getOpcode() == ISD::FMUL && (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT))) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), N1); // fold (fadd x, (fmul y, z)) -> (fma y, z, x) // Note: Commutes FADD operands. if (N1.getOpcode() == ISD::FMUL && (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT))) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1.getOperand(0), N1.getOperand(1), N0); // When FP_EXTEND nodes are free on the target, and there is an opportunity // to combine into FMA, arrange such nodes accordingly. if (TLI.isFPExtFree(VT)) { // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::FMUL) return DAG.getNode(ISD::FMA, SDLoc(N), VT, DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N00.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N00.getOperand(1)), N1); } // fold (fadd x, (fpext (fmul y, z)), z) -> (fma (fpext y), (fpext z), x) // Note: Commutes FADD operands. if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); if (N10.getOpcode() == ISD::FMUL) return DAG.getNode(ISD::FMA, SDLoc(N), VT, DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N10.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N10.getOperand(1)), N0); } } // More folding opportunities when target permits. if (TLI.enableAggressiveFMAFusion(VT)) { // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z)) if (N0.getOpcode() == ISD::FMA && N0.getOperand(2).getOpcode() == ISD::FMUL) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(2).getOperand(0), N0.getOperand(2).getOperand(1), N1)); // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x)) if (N1->getOpcode() == ISD::FMA && N1.getOperand(2).getOpcode() == ISD::FMUL) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1.getOperand(0), N1.getOperand(1), DAG.getNode(ISD::FMA, SDLoc(N), VT, N1.getOperand(2).getOperand(0), N1.getOperand(2).getOperand(1), N0)); } } return SDValue(); } SDValue DAGCombiner::visitFSUB(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); EVT VT = N->getValueType(0); SDLoc dl(N); const TargetOptions &Options = DAG.getTarget().Options; // fold vector ops if (VT.isVector()) { SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; } // fold (fsub c1, c2) -> c1-c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0, N1); // fold (fsub A, (fneg B)) -> (fadd A, B) if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) return DAG.getNode(ISD::FADD, dl, VT, N0, GetNegatedExpression(N1, DAG, LegalOperations)); // If 'unsafe math' is enabled, fold lots of things. if (Options.UnsafeFPMath) { // (fsub A, 0) -> A if (N1CFP && N1CFP->getValueAPF().isZero()) return N0; // (fsub 0, B) -> -B if (N0CFP && N0CFP->getValueAPF().isZero()) { if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) return GetNegatedExpression(N1, DAG, LegalOperations); if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, dl, VT, N1); } // (fsub x, x) -> 0.0 if (N0 == N1) return DAG.getConstantFP(0.0f, VT); // (fsub x, (fadd x, y)) -> (fneg y) // (fsub x, (fadd y, x)) -> (fneg y) if (N1.getOpcode() == ISD::FADD) { SDValue N10 = N1->getOperand(0); SDValue N11 = N1->getOperand(1); if (N10 == N0 && isNegatibleForFree(N11, LegalOperations, TLI, &Options)) return GetNegatedExpression(N11, DAG, LegalOperations); if (N11 == N0 && isNegatibleForFree(N10, LegalOperations, TLI, &Options)) return GetNegatedExpression(N10, DAG, LegalOperations); } } // FSUB -> FMA combines: if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) && TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) { // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) if (N0.getOpcode() == ISD::FMUL && (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT))) return DAG.getNode(ISD::FMA, dl, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(ISD::FNEG, dl, VT, N1)); // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) // Note: Commutes FSUB operands. if (N1.getOpcode() == ISD::FMUL && (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT))) return DAG.getNode(ISD::FMA, dl, VT, DAG.getNode(ISD::FNEG, dl, VT, N1.getOperand(0)), N1.getOperand(1), N0); // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0).getOpcode() == ISD::FMUL && ((N0->hasOneUse() && N0.getOperand(0).hasOneUse()) || TLI.enableAggressiveFMAFusion(VT))) { SDValue N00 = N0.getOperand(0).getOperand(0); SDValue N01 = N0.getOperand(0).getOperand(1); return DAG.getNode(ISD::FMA, dl, VT, DAG.getNode(ISD::FNEG, dl, VT, N00), N01, DAG.getNode(ISD::FNEG, dl, VT, N1)); } // When FP_EXTEND nodes are free on the target, and there is an opportunity // to combine into FMA, arrange such nodes accordingly. if (TLI.isFPExtFree(VT)) { // fold (fsub (fpext (fmul x, y)), z) // -> (fma (fpext x), (fpext y), (fneg z)) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::FMUL) return DAG.getNode(ISD::FMA, SDLoc(N), VT, DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N00.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N00.getOperand(1)), DAG.getNode(ISD::FNEG, SDLoc(N), VT, N1)); } // fold (fsub x, (fpext (fmul y, z))) // -> (fma (fneg (fpext y)), (fpext z), x) // Note: Commutes FSUB operands. if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); if (N10.getOpcode() == ISD::FMUL) return DAG.getNode(ISD::FMA, SDLoc(N), VT, DAG.getNode(ISD::FNEG, SDLoc(N), VT, DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N10.getOperand(0))), DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N10.getOperand(1)), N0); } // fold (fsub (fpext (fneg (fmul, x, y))), z) // -> (fma (fneg (fpext x)), (fpext y), (fneg z)) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::FNEG) { SDValue N000 = N00.getOperand(0); if (N000.getOpcode() == ISD::FMUL) { return DAG.getNode(ISD::FMA, dl, VT, DAG.getNode(ISD::FNEG, dl, VT, DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N000.getOperand(0))), DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N000.getOperand(1)), DAG.getNode(ISD::FNEG, dl, VT, N1)); } } } // fold (fsub (fneg (fpext (fmul, x, y))), z) // -> (fma (fneg (fpext x)), (fpext y), (fneg z)) if (N0.getOpcode() == ISD::FNEG) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::FP_EXTEND) { SDValue N000 = N00.getOperand(0); if (N000.getOpcode() == ISD::FMUL) { return DAG.getNode(ISD::FMA, dl, VT, DAG.getNode(ISD::FNEG, dl, VT, DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N000.getOperand(0))), DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N000.getOperand(1)), DAG.getNode(ISD::FNEG, dl, VT, N1)); } } } } // More folding opportunities when target permits. if (TLI.enableAggressiveFMAFusion(VT)) { // fold (fsub (fma x, y, (fmul u, v)), z) // -> (fma x, y (fma u, v, (fneg z))) if (N0.getOpcode() == ISD::FMA && N0.getOperand(2).getOpcode() == ISD::FMUL) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(2).getOperand(0), N0.getOperand(2).getOperand(1), DAG.getNode(ISD::FNEG, SDLoc(N), VT, N1))); // fold (fsub x, (fma y, z, (fmul u, v))) // -> (fma (fneg y), z, (fma (fneg u), v, x)) if (N1.getOpcode() == ISD::FMA && N1.getOperand(2).getOpcode() == ISD::FMUL) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); return DAG.getNode(ISD::FMA, SDLoc(N), VT, DAG.getNode(ISD::FNEG, SDLoc(N), VT, N1.getOperand(0)), N1.getOperand(1), DAG.getNode(ISD::FMA, SDLoc(N), VT, DAG.getNode(ISD::FNEG, SDLoc(N), VT, N20), N21, N0)); } } } return SDValue(); } SDValue DAGCombiner::visitFMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); EVT VT = N->getValueType(0); const TargetOptions &Options = DAG.getTarget().Options; // fold vector ops if (VT.isVector()) { // This just handles C1 * C2 for vectors. Other vector folds are below. SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; // Canonicalize vector constant to RHS. if (N0.getOpcode() == ISD::BUILD_VECTOR && N1.getOpcode() != ISD::BUILD_VECTOR) if (auto *BV0 = dyn_cast(N0)) if (BV0->isConstant()) return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0); } // fold (fmul c1, c2) -> c1*c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0, N1); // canonicalize constant to RHS if (N0CFP && !N1CFP) return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N1, N0); // fold (fmul A, 1.0) -> A if (N1CFP && N1CFP->isExactlyValue(1.0)) return N0; if (Options.UnsafeFPMath) { // fold (fmul A, 0) -> 0 if (N1CFP && N1CFP->getValueAPF().isZero()) return N1; // fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2)) if (N0.getOpcode() == ISD::FMUL) { // Fold scalars or any vector constants (not just splats). // This fold is done in general by InstCombine, but extra fmul insts // may have been generated during lowering. SDValue N01 = N0.getOperand(1); auto *BV1 = dyn_cast(N1); auto *BV01 = dyn_cast(N01); if ((N1CFP && isConstOrConstSplatFP(N01)) || (BV1 && BV01 && BV1->isConstant() && BV01->isConstant())) { SDLoc SL(N); SDValue MulConsts = DAG.getNode(ISD::FMUL, SL, VT, N01, N1); return DAG.getNode(ISD::FMUL, SL, VT, N0.getOperand(0), MulConsts); } } // fold (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) // Undo the fmul 2.0, x -> fadd x, x transformation, since if it occurs // during an early run of DAGCombiner can prevent folding with fmuls // inserted during lowering. if (N0.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1)) { SDLoc SL(N); const SDValue Two = DAG.getConstantFP(2.0, VT); SDValue MulConsts = DAG.getNode(ISD::FMUL, SL, VT, Two, N1); return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), MulConsts); } } // fold (fmul X, 2.0) -> (fadd X, X) if (N1CFP && N1CFP->isExactlyValue(+2.0)) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N0); // fold (fmul X, -1.0) -> (fneg X) if (N1CFP && N1CFP->isExactlyValue(-1.0)) if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0); // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y) if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) { if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) { // Both can be negated for free, check to see if at least one is cheaper // negated. if (LHSNeg == 2 || RHSNeg == 2) return DAG.getNode(ISD::FMUL, SDLoc(N), VT, GetNegatedExpression(N0, DAG, LegalOperations), GetNegatedExpression(N1, DAG, LegalOperations)); } } return SDValue(); } SDValue DAGCombiner::visitFMA(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); SDLoc dl(N); const TargetOptions &Options = DAG.getTarget().Options; // Constant fold FMA. if (isa(N0) && isa(N1) && isa(N2)) { return DAG.getNode(ISD::FMA, dl, VT, N0, N1, N2); } if (Options.UnsafeFPMath) { if (N0CFP && N0CFP->isZero()) return N2; if (N1CFP && N1CFP->isZero()) return N2; } if (N0CFP && N0CFP->isExactlyValue(1.0)) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2); if (N1CFP && N1CFP->isExactlyValue(1.0)) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2); // Canonicalize (fma c, x, y) -> (fma x, c, y) if (N0CFP && !N1CFP) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2); // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) if (Options.UnsafeFPMath && N1CFP && N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) && N2.getOperand(1).getOpcode() == ISD::ConstantFP) { return DAG.getNode(ISD::FMUL, dl, VT, N0, DAG.getNode(ISD::FADD, dl, VT, N1, N2.getOperand(1))); } // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) if (Options.UnsafeFPMath && N0.getOpcode() == ISD::FMUL && N1CFP && N0.getOperand(1).getOpcode() == ISD::ConstantFP) { return DAG.getNode(ISD::FMA, dl, VT, N0.getOperand(0), DAG.getNode(ISD::FMUL, dl, VT, N1, N0.getOperand(1)), N2); } // (fma x, 1, y) -> (fadd x, y) // (fma x, -1, y) -> (fadd (fneg x), y) if (N1CFP) { if (N1CFP->isExactlyValue(1.0)) return DAG.getNode(ISD::FADD, dl, VT, N0, N2); if (N1CFP->isExactlyValue(-1.0) && (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) { SDValue RHSNeg = DAG.getNode(ISD::FNEG, dl, VT, N0); AddToWorklist(RHSNeg.getNode()); return DAG.getNode(ISD::FADD, dl, VT, N2, RHSNeg); } } // (fma x, c, x) -> (fmul x, (c+1)) if (Options.UnsafeFPMath && N1CFP && N0 == N2) return DAG.getNode(ISD::FMUL, dl, VT, N0, DAG.getNode(ISD::FADD, dl, VT, N1, DAG.getConstantFP(1.0, VT))); // (fma x, c, (fneg x)) -> (fmul x, (c-1)) if (Options.UnsafeFPMath && N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) return DAG.getNode(ISD::FMUL, dl, VT, N0, DAG.getNode(ISD::FADD, dl, VT, N1, DAG.getConstantFP(-1.0, VT))); return SDValue(); } SDValue DAGCombiner::visitFDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; // fold vector ops if (VT.isVector()) { SDValue FoldedVOp = SimplifyVBinOp(N); if (FoldedVOp.getNode()) return FoldedVOp; } // fold (fdiv c1, c2) -> c1/c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1); if (Options.UnsafeFPMath) { // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. if (N1CFP) { // Compute the reciprocal 1.0 / c2. APFloat N1APF = N1CFP->getValueAPF(); APFloat Recip(N1APF.getSemantics(), 1); // 1.0 APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven); // Only do the transform if the reciprocal is a legal fp immediate that // isn't too nasty (eg NaN, denormal, ...). if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty (!LegalOperations || // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM // backend)... we should handle this gracefully after Legalize. // TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT) || TLI.isOperationLegal(llvm::ISD::ConstantFP, VT) || TLI.isFPImmLegal(Recip, VT))) return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0, DAG.getConstantFP(Recip, VT)); } // If this FDIV is part of a reciprocal square root, it may be folded // into a target-specific square root estimate instruction. if (N1.getOpcode() == ISD::FSQRT) { if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0))) { return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); } } else if (N1.getOpcode() == ISD::FP_EXTEND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) { RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV); AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); } } else if (N1.getOpcode() == ISD::FP_ROUND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) { RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1)); AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); } } else if (N1.getOpcode() == ISD::FMUL) { // Look through an FMUL. Even though this won't remove the FDIV directly, // it's still worthwhile to get rid of the FSQRT if possible. SDValue SqrtOp; SDValue OtherOp; if (N1.getOperand(0).getOpcode() == ISD::FSQRT) { SqrtOp = N1.getOperand(0); OtherOp = N1.getOperand(1); } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) { SqrtOp = N1.getOperand(1); OtherOp = N1.getOperand(0); } if (SqrtOp.getNode()) { // We found a FSQRT, so try to make this fold: // x / (y * sqrt(z)) -> x * (rsqrt(z) / y) if (SDValue RV = BuildRsqrtEstimate(SqrtOp.getOperand(0))) { RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp); AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); } } } // Fold into a reciprocal estimate and multiply instead of a real divide. if (SDValue RV = BuildReciprocalEstimate(N1)) { AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); } } // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) { if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) { // Both can be negated for free, check to see if at least one is cheaper // negated. if (LHSNeg == 2 || RHSNeg == 2) return DAG.getNode(ISD::FDIV, SDLoc(N), VT, GetNegatedExpression(N0, DAG, LegalOperations), GetNegatedExpression(N1, DAG, LegalOperations)); } } // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal. // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip) // Notice that this is not always beneficial. One reason is different target // may have different costs for FDIV and FMUL, so sometimes the cost of two // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL". if (Options.UnsafeFPMath) { // Skip if current node is a reciprocal. if (N0CFP && N0CFP->isExactlyValue(1.0)) return SDValue(); SmallVector Users; // Find all FDIV users of the same divisor. for (SDNode::use_iterator UI = N1.getNode()->use_begin(), UE = N1.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = UI.getUse().getUser(); if (User->getOpcode() == ISD::FDIV && User->getOperand(1) == N1) Users.push_back(User); } if (TLI.combineRepeatedFPDivisors(Users.size())) { SDValue FPOne = DAG.getConstantFP(1.0, VT); // floating point 1.0 SDValue Reciprocal = DAG.getNode(ISD::FDIV, SDLoc(N), VT, FPOne, N1); // Dividend / Divisor -> Dividend * Reciprocal for (auto I = Users.begin(), E = Users.end(); I != E; ++I) { if ((*I)->getOperand(0) != FPOne) { SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(*I), VT, (*I)->getOperand(0), Reciprocal); DAG.ReplaceAllUsesWith(*I, NewNode.getNode()); } } return SDValue(); } } return SDValue(); } SDValue DAGCombiner::visitFREM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); // fold (frem c1, c2) -> fmod(c1,c2) if (N0CFP && N1CFP) return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1); return SDValue(); } SDValue DAGCombiner::visitFSQRT(SDNode *N) { if (DAG.getTarget().Options.UnsafeFPMath && !TLI.isFsqrtCheap()) { // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5) if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) { EVT VT = RV.getValueType(); RV = DAG.getNode(ISD::FMUL, SDLoc(N), VT, N->getOperand(0), RV); AddToWorklist(RV.getNode()); // Unfortunately, RV is now NaN if the input was exactly 0. // Select out this case and force the answer to 0. SDValue Zero = DAG.getConstantFP(0.0, VT); SDValue ZeroCmp = DAG.getSetCC(SDLoc(N), TLI.getSetCCResultType(*DAG.getContext(), VT), N->getOperand(0), Zero, ISD::SETEQ); AddToWorklist(ZeroCmp.getNode()); AddToWorklist(RV.getNode()); RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, SDLoc(N), VT, ZeroCmp, Zero, RV); return RV; } } return SDValue(); } SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); if (N0CFP && N1CFP) // Constant fold return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1); if (N1CFP) { const APFloat& V = N1CFP->getValueAPF(); // copysign(x, c1) -> fabs(x) iff ispos(c1) // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1) if (!V.isNegative()) { if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT)) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); } else { if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, SDLoc(N), VT, DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0)); } } // copysign(fabs(x), y) -> copysign(x, y) // copysign(fneg(x), y) -> copysign(x, y) // copysign(copysign(x,z), y) -> copysign(x, y) if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN) return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0.getOperand(0), N1); // copysign(x, abs(y)) -> abs(x) if (N1.getOpcode() == ISD::FABS) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); // copysign(x, copysign(y,z)) -> copysign(x, z) if (N1.getOpcode() == ISD::FCOPYSIGN) return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(1)); // copysign(x, fp_extend(y)) -> copysign(x, y) // copysign(x, fp_round(y)) -> copysign(x, y) if (N1.getOpcode() == ISD::FP_EXTEND || N1.getOpcode() == ISD::FP_ROUND) return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0)); return SDValue(); } SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { SDValue N0 = N->getOperand(0); ConstantSDNode *N0C = dyn_cast(N0); EVT VT = N->getValueType(0); EVT OpVT = N0.getValueType(); // fold (sint_to_fp c1) -> c1fp if (N0C && // ...but only if the target supports immediate floating-point values (!LegalOperations || TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); // If the input is a legal type, and SINT_TO_FP is not legal on this target, // but UINT_TO_FP is legal on this target, try to convert. if (!TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT) && TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT)) { // If the sign bit is known to be zero, we can change this to UINT_TO_FP. if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); } // The next optimizations are desirable only if SELECT_CC can be lowered. if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) { // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 && !VT.isVector() && (!LegalOperations || TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) { SDValue Ops[] = { N0.getOperand(0), N0.getOperand(1), DAG.getConstantFP(-1.0, VT) , DAG.getConstantFP(0.0, VT), N0.getOperand(2) }; return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, Ops); } // fold (sint_to_fp (zext (setcc x, y, cc))) -> // (select_cc x, y, 1.0, 0.0,, cc) if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.getOperand(0).getOpcode() == ISD::SETCC &&!VT.isVector() && (!LegalOperations || TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) { SDValue Ops[] = { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1), DAG.getConstantFP(1.0, VT) , DAG.getConstantFP(0.0, VT), N0.getOperand(0).getOperand(2) }; return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, Ops); } } return SDValue(); } SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { SDValue N0 = N->getOperand(0); ConstantSDNode *N0C = dyn_cast(N0); EVT VT = N->getValueType(0); EVT OpVT = N0.getValueType(); // fold (uint_to_fp c1) -> c1fp if (N0C && // ...but only if the target supports immediate floating-point values (!LegalOperations || TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); // If the input is a legal type, and UINT_TO_FP is not legal on this target, // but SINT_TO_FP is legal on this target, try to convert. if (!TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT) && TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT)) { // If the sign bit is known to be zero, we can change this to SINT_TO_FP. if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); } // The next optimizations are desirable only if SELECT_CC can be lowered. if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) { // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) if (N0.getOpcode() == ISD::SETCC && !VT.isVector() && (!LegalOperations || TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) { SDValue Ops[] = { N0.getOperand(0), N0.getOperand(1), DAG.getConstantFP(1.0, VT), DAG.getConstantFP(0.0, VT), N0.getOperand(2) }; return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, Ops); } } return SDValue(); } SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) { SDValue N0 = N->getOperand(0); ConstantFPSDNode *N0CFP = dyn_cast(N0); EVT VT = N->getValueType(0); // fold (fp_to_sint c1fp) -> c1 if (N0CFP) return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) { SDValue N0 = N->getOperand(0); ConstantFPSDNode *N0CFP = dyn_cast(N0); EVT VT = N->getValueType(0); // fold (fp_to_uint c1fp) -> c1 if (N0CFP) return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = dyn_cast(N0); EVT VT = N->getValueType(0); // fold (fp_round c1fp) -> c1fp if (N0CFP) return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0, N1); // fold (fp_round (fp_extend x)) -> x if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType()) return N0.getOperand(0); // fold (fp_round (fp_round x)) -> (fp_round x) if (N0.getOpcode() == ISD::FP_ROUND) { // This is a value preserving truncation if both round's are. bool IsTrunc = N->getConstantOperandVal(1) == 1 && N0.getNode()->getConstantOperandVal(1) == 1; return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0.getOperand(0), DAG.getIntPtrConstant(IsTrunc)); } // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y) if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) { SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT, N0.getOperand(0), N1); AddToWorklist(Tmp.getNode()); return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, Tmp, N0.getOperand(1)); } return SDValue(); } SDValue DAGCombiner::visitFP_ROUND_INREG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT EVT = cast(N->getOperand(1))->getVT(); ConstantFPSDNode *N0CFP = dyn_cast(N0); // fold (fp_round_inreg c1fp) -> c1fp if (N0CFP && isTypeLegal(EVT)) { SDValue Round = DAG.getConstantFP(*N0CFP->getConstantFPValue(), EVT); return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Round); } return SDValue(); } SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); ConstantFPSDNode *N0CFP = dyn_cast(N0); EVT VT = N->getValueType(0); // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded. if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND) return SDValue(); // fold (fp_extend c1fp) -> c1fp if (N0CFP) return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0); // Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the // value of X. if (N0.getOpcode() == ISD::FP_ROUND && N0.getNode()->getConstantOperandVal(1) == 1) { SDValue In = N0.getOperand(0); if (In.getValueType() == VT) return In; if (VT.bitsLT(In.getValueType())) return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, In, N0.getOperand(1)); return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In); } // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad, DAG.getIntPtrConstant(1)), ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } return SDValue(); } SDValue DAGCombiner::visitFCEIL(SDNode *N) { SDValue N0 = N->getOperand(0); ConstantFPSDNode *N0CFP = dyn_cast(N0); EVT VT = N->getValueType(0); // fold (fceil c1) -> fceil(c1) if (N0CFP) return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitFTRUNC(SDNode *N) { SDValue N0 = N->getOperand(0); ConstantFPSDNode *N0CFP = dyn_cast(N0); EVT VT = N->getValueType(0); // fold (ftrunc c1) -> ftrunc(c1) if (N0CFP) return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitFFLOOR(SDNode *N) { SDValue N0 = N->getOperand(0); ConstantFPSDNode *N0CFP = dyn_cast(N0); EVT VT = N->getValueType(0); // fold (ffloor c1) -> ffloor(c1) if (N0CFP) return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0); return SDValue(); } // FIXME: FNEG and FABS have a lot in common; refactor. SDValue DAGCombiner::visitFNEG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (VT.isVector()) { SDValue FoldedVOp = SimplifyVUnaryOp(N); if (FoldedVOp.getNode()) return FoldedVOp; } // Constant fold FNEG. if (isa(N0)) return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N->getOperand(0)); if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(), &DAG.getTarget().Options)) return GetNegatedExpression(N0, DAG, LegalOperations); // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading // constant pool values. if (!TLI.isFNegFree(VT) && N0.getOpcode() == ISD::BITCAST && N0.getNode()->hasOneUse()) { SDValue Int = N0.getOperand(0); EVT IntVT = Int.getValueType(); if (IntVT.isInteger() && !IntVT.isVector()) { APInt SignMask; if (N0.getValueType().isVector()) { // For a vector, get a mask such as 0x80... per scalar element // and splat it. SignMask = APInt::getSignBit(N0.getValueType().getScalarSizeInBits()); SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); } else { // For a scalar, just generate 0x80... SignMask = APInt::getSignBit(IntVT.getSizeInBits()); } Int = DAG.getNode(ISD::XOR, SDLoc(N0), IntVT, Int, DAG.getConstant(SignMask, IntVT)); AddToWorklist(Int.getNode()); return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Int); } } // (fneg (fmul c, x)) -> (fmul -c, x) if (N0.getOpcode() == ISD::FMUL) { ConstantFPSDNode *CFP1 = dyn_cast(N0.getOperand(1)); if (CFP1) { APFloat CVal = CFP1->getValueAPF(); CVal.changeSign(); if (Level >= AfterLegalizeDAG && (TLI.isFPImmLegal(CVal, N->getValueType(0)) || TLI.isOperationLegal(ISD::ConstantFP, N->getValueType(0)))) return DAG.getNode( ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1))); } } return SDValue(); } SDValue DAGCombiner::visitFMINNUM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); const ConstantFPSDNode *N0CFP = dyn_cast(N0); const ConstantFPSDNode *N1CFP = dyn_cast(N1); if (N0CFP && N1CFP) { const APFloat &C0 = N0CFP->getValueAPF(); const APFloat &C1 = N1CFP->getValueAPF(); return DAG.getConstantFP(minnum(C0, C1), N->getValueType(0)); } if (N0CFP) { EVT VT = N->getValueType(0); // Canonicalize to constant on RHS. return DAG.getNode(ISD::FMINNUM, SDLoc(N), VT, N1, N0); } return SDValue(); } SDValue DAGCombiner::visitFMAXNUM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); const ConstantFPSDNode *N0CFP = dyn_cast(N0); const ConstantFPSDNode *N1CFP = dyn_cast(N1); if (N0CFP && N1CFP) { const APFloat &C0 = N0CFP->getValueAPF(); const APFloat &C1 = N1CFP->getValueAPF(); return DAG.getConstantFP(maxnum(C0, C1), N->getValueType(0)); } if (N0CFP) { EVT VT = N->getValueType(0); // Canonicalize to constant on RHS. return DAG.getNode(ISD::FMAXNUM, SDLoc(N), VT, N1, N0); } return SDValue(); } SDValue DAGCombiner::visitFABS(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (VT.isVector()) { SDValue FoldedVOp = SimplifyVUnaryOp(N); if (FoldedVOp.getNode()) return FoldedVOp; } // fold (fabs c1) -> fabs(c1) if (isa(N0)) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); // fold (fabs (fabs x)) -> (fabs x) if (N0.getOpcode() == ISD::FABS) return N->getOperand(0); // fold (fabs (fneg x)) -> (fabs x) // fold (fabs (fcopysign x, y)) -> (fabs x) if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0)); // Transform fabs(bitconvert(x)) -> bitconvert(x & ~sign) to avoid loading // constant pool values. if (!TLI.isFAbsFree(VT) && N0.getOpcode() == ISD::BITCAST && N0.getNode()->hasOneUse()) { SDValue Int = N0.getOperand(0); EVT IntVT = Int.getValueType(); if (IntVT.isInteger() && !IntVT.isVector()) { APInt SignMask; if (N0.getValueType().isVector()) { // For a vector, get a mask such as 0x7f... per scalar element // and splat it. SignMask = ~APInt::getSignBit(N0.getValueType().getScalarSizeInBits()); SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); } else { // For a scalar, just generate 0x7f... SignMask = ~APInt::getSignBit(IntVT.getSizeInBits()); } Int = DAG.getNode(ISD::AND, SDLoc(N0), IntVT, Int, DAG.getConstant(SignMask, IntVT)); AddToWorklist(Int.getNode()); return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Int); } } return SDValue(); } SDValue DAGCombiner::visitBRCOND(SDNode *N) { SDValue Chain = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); // If N is a constant we could fold this into a fallthrough or unconditional // branch. However that doesn't happen very often in normal code, because // Instcombine/SimplifyCFG should have handled the available opportunities. // If we did this folding here, it would be necessary to update the // MachineBasicBlock CFG, which is awkward. // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal // on the target. if (N1.getOpcode() == ISD::SETCC && TLI.isOperationLegalOrCustom(ISD::BR_CC, N1.getOperand(0).getValueType())) { return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, Chain, N1.getOperand(2), N1.getOperand(0), N1.getOperand(1), N2); } if ((N1.hasOneUse() && N1.getOpcode() == ISD::SRL) || ((N1.getOpcode() == ISD::TRUNCATE && N1.hasOneUse()) && (N1.getOperand(0).hasOneUse() && N1.getOperand(0).getOpcode() == ISD::SRL))) { SDNode *Trunc = nullptr; if (N1.getOpcode() == ISD::TRUNCATE) { // Look pass the truncate. Trunc = N1.getNode(); N1 = N1.getOperand(0); } // Match this pattern so that we can generate simpler code: // // %a = ... // %b = and i32 %a, 2 // %c = srl i32 %b, 1 // brcond i32 %c ... // // into // // %a = ... // %b = and i32 %a, 2 // %c = setcc eq %b, 0 // brcond %c ... // // This applies only when the AND constant value has one bit set and the // SRL constant is equal to the log2 of the AND constant. The back-end is // smart enough to convert the result into a TEST/JMP sequence. SDValue Op0 = N1.getOperand(0); SDValue Op1 = N1.getOperand(1); if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::Constant) { SDValue AndOp1 = Op0.getOperand(1); if (AndOp1.getOpcode() == ISD::Constant) { const APInt &AndConst = cast(AndOp1)->getAPIntValue(); if (AndConst.isPowerOf2() && cast(Op1)->getAPIntValue()==AndConst.logBase2()) { SDValue SetCC = DAG.getSetCC(SDLoc(N), getSetCCResultType(Op0.getValueType()), Op0, DAG.getConstant(0, Op0.getValueType()), ISD::SETNE); SDValue NewBRCond = DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, SetCC, N2); // Don't add the new BRCond into the worklist or else SimplifySelectCC // will convert it back to (X & C1) >> C2. CombineTo(N, NewBRCond, false); // Truncate is dead. if (Trunc) deleteAndRecombine(Trunc); // Replace the uses of SRL with SETCC WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N1, SetCC); deleteAndRecombine(N1.getNode()); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } } if (Trunc) // Restore N1 if the above transformation doesn't match. N1 = N->getOperand(1); } // Transform br(xor(x, y)) -> br(x != y) // Transform br(xor(xor(x,y), 1)) -> br (x == y) if (N1.hasOneUse() && N1.getOpcode() == ISD::XOR) { SDNode *TheXor = N1.getNode(); SDValue Op0 = TheXor->getOperand(0); SDValue Op1 = TheXor->getOperand(1); if (Op0.getOpcode() == Op1.getOpcode()) { // Avoid missing important xor optimizations. SDValue Tmp = visitXOR(TheXor); if (Tmp.getNode()) { if (Tmp.getNode() != TheXor) { DEBUG(dbgs() << "\nReplacing.8 "; TheXor->dump(&DAG); dbgs() << "\nWith: "; Tmp.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N1, Tmp); deleteAndRecombine(TheXor); return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, Tmp, N2); } // visitXOR has changed XOR's operands or replaced the XOR completely, // bail out. return SDValue(N, 0); } } if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) { bool Equal = false; if (ConstantSDNode *RHSCI = dyn_cast(Op0)) if (RHSCI->getAPIntValue() == 1 && Op0.hasOneUse() && Op0.getOpcode() == ISD::XOR) { TheXor = Op0.getNode(); Equal = true; } EVT SetCCVT = N1.getValueType(); if (LegalTypes) SetCCVT = getSetCCResultType(SetCCVT); SDValue SetCC = DAG.getSetCC(SDLoc(TheXor), SetCCVT, Op0, Op1, Equal ? ISD::SETEQ : ISD::SETNE); // Replace the uses of XOR with SETCC WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N1, SetCC); deleteAndRecombine(N1.getNode()); return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, SetCC, N2); } } return SDValue(); } // Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB. // SDValue DAGCombiner::visitBR_CC(SDNode *N) { CondCodeSDNode *CC = cast(N->getOperand(1)); SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3); // If N is a constant we could fold this into a fallthrough or unconditional // branch. However that doesn't happen very often in normal code, because // Instcombine/SimplifyCFG should have handled the available opportunities. // If we did this folding here, it would be necessary to update the // MachineBasicBlock CFG, which is awkward. // Use SimplifySetCC to simplify SETCC's. SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()), CondLHS, CondRHS, CC->get(), SDLoc(N), false); if (Simp.getNode()) AddToWorklist(Simp.getNode()); // fold to a simpler setcc if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC) return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, N->getOperand(0), Simp.getOperand(2), Simp.getOperand(0), Simp.getOperand(1), N->getOperand(4)); return SDValue(); } /// Return true if 'Use' is a load or a store that uses N as its base pointer /// and that N may be folded in the load / store addressing mode. static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG, const TargetLowering &TLI) { EVT VT; if (LoadSDNode *LD = dyn_cast(Use)) { if (LD->isIndexed() || LD->getBasePtr().getNode() != N) return false; VT = Use->getValueType(0); } else if (StoreSDNode *ST = dyn_cast(Use)) { if (ST->isIndexed() || ST->getBasePtr().getNode() != N) return false; VT = ST->getValue().getValueType(); } else return false; TargetLowering::AddrMode AM; if (N->getOpcode() == ISD::ADD) { ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); if (Offset) // [reg +/- imm] AM.BaseOffs = Offset->getSExtValue(); else // [reg +/- reg] AM.Scale = 1; } else if (N->getOpcode() == ISD::SUB) { ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); if (Offset) // [reg +/- imm] AM.BaseOffs = -Offset->getSExtValue(); else // [reg +/- reg] AM.Scale = 1; } else return false; return TLI.isLegalAddressingMode(AM, VT.getTypeForEVT(*DAG.getContext())); } /// Try turning a load/store into a pre-indexed load/store when the base /// pointer is an add or subtract and it has other uses besides the load/store. /// After the transformation, the new indexed load/store has effectively folded /// the add/subtract in and all of its other uses are redirected to the /// new load/store. bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { if (Level < AfterLegalizeDAG) return false; bool isLoad = true; SDValue Ptr; EVT VT; if (LoadSDNode *LD = dyn_cast(N)) { if (LD->isIndexed()) return false; VT = LD->getMemoryVT(); if (!TLI.isIndexedLoadLegal(ISD::PRE_INC, VT) && !TLI.isIndexedLoadLegal(ISD::PRE_DEC, VT)) return false; Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { if (ST->isIndexed()) return false; VT = ST->getMemoryVT(); if (!TLI.isIndexedStoreLegal(ISD::PRE_INC, VT) && !TLI.isIndexedStoreLegal(ISD::PRE_DEC, VT)) return false; Ptr = ST->getBasePtr(); isLoad = false; } else { return false; } // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail // out. There is no reason to make this a preinc/predec. if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) || Ptr.getNode()->hasOneUse()) return false; // Ask the target to do addressing mode selection. SDValue BasePtr; SDValue Offset; ISD::MemIndexedMode AM = ISD::UNINDEXED; if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG)) return false; // Backends without true r+i pre-indexed forms may need to pass a // constant base with a variable offset so that constant coercion // will work with the patterns in canonical form. bool Swapped = false; if (isa(BasePtr)) { std::swap(BasePtr, Offset); Swapped = true; } // Don't create a indexed load / store with zero offset. if (isa(Offset) && cast(Offset)->isNullValue()) return false; // Try turning it into a pre-indexed load / store except when: // 1) The new base ptr is a frame index. // 2) If N is a store and the new base ptr is either the same as or is a // predecessor of the value being stored. // 3) Another use of old base ptr is a predecessor of N. If ptr is folded // that would create a cycle. // 4) All uses are load / store ops that use it as old base ptr. // Check #1. Preinc'ing a frame index would require copying the stack pointer // (plus the implicit offset) to a register to preinc anyway. if (isa(BasePtr) || isa(BasePtr)) return false; // Check #2. if (!isLoad) { SDValue Val = cast(N)->getValue(); if (Val == BasePtr || BasePtr.getNode()->isPredecessorOf(Val.getNode())) return false; } // If the offset is a constant, there may be other adds of constants that // can be folded with this one. We should do this to avoid having to keep // a copy of the original base pointer. SmallVector OtherUses; if (isa(Offset)) for (SDNode *Use : BasePtr.getNode()->uses()) { if (Use == Ptr.getNode()) continue; if (Use->isPredecessorOf(N)) continue; if (Use->getOpcode() != ISD::ADD && Use->getOpcode() != ISD::SUB) { OtherUses.clear(); break; } SDValue Op0 = Use->getOperand(0), Op1 = Use->getOperand(1); if (Op1.getNode() == BasePtr.getNode()) std::swap(Op0, Op1); assert(Op0.getNode() == BasePtr.getNode() && "Use of ADD/SUB but not an operand"); if (!isa(Op1)) { OtherUses.clear(); break; } // FIXME: In some cases, we can be smarter about this. if (Op1.getValueType() != Offset.getValueType()) { OtherUses.clear(); break; } OtherUses.push_back(Use); } if (Swapped) std::swap(BasePtr, Offset); // Now check for #3 and #4. bool RealUse = false; // Caches for hasPredecessorHelper SmallPtrSet Visited; SmallVector Worklist; for (SDNode *Use : Ptr.getNode()->uses()) { if (Use == N) continue; if (N->hasPredecessorHelper(Use, Visited, Worklist)) return false; // If Ptr may be folded in addressing mode of other use, then it's // not profitable to do this transformation. if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI)) RealUse = true; } if (!RealUse) return false; SDValue Result; if (isLoad) Result = DAG.getIndexedLoad(SDValue(N,0), SDLoc(N), BasePtr, Offset, AM); else Result = DAG.getIndexedStore(SDValue(N,0), SDLoc(N), BasePtr, Offset, AM); ++PreIndexedNodes; ++NodesCombined; DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); if (isLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); } else { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1)); } // Finally, since the node is now dead, remove it from the graph. deleteAndRecombine(N); if (Swapped) std::swap(BasePtr, Offset); // Replace other uses of BasePtr that can be updated to use Ptr for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) { unsigned OffsetIdx = 1; if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode()) OffsetIdx = 0; assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() == BasePtr.getNode() && "Expected BasePtr operand"); // We need to replace ptr0 in the following expression: // x0 * offset0 + y0 * ptr0 = t0 // knowing that // x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store) // // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the // indexed load/store and the expresion that needs to be re-written. // // Therefore, we have: // t0 = (x0 * offset0 - x1 * y0 * y1 *offset1) + (y0 * y1) * t1 ConstantSDNode *CN = cast(OtherUses[i]->getOperand(OffsetIdx)); int X0, X1, Y0, Y1; APInt Offset0 = CN->getAPIntValue(); APInt Offset1 = cast(Offset)->getAPIntValue(); X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1; Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1; X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1; Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1; unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD; APInt CNV = Offset0; if (X0 < 0) CNV = -CNV; if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1; else CNV = CNV - Offset1; // We can now generate the new expression. SDValue NewOp1 = DAG.getConstant(CNV, CN->getValueType(0)); SDValue NewOp2 = Result.getValue(isLoad ? 1 : 0); SDValue NewUse = DAG.getNode(Opcode, SDLoc(OtherUses[i]), OtherUses[i]->getValueType(0), NewOp1, NewOp2); DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse); deleteAndRecombine(OtherUses[i]); } // Replace the uses of Ptr with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0)); deleteAndRecombine(Ptr.getNode()); return true; } /// Try to combine a load/store with a add/sub of the base pointer node into a /// post-indexed load/store. The transformation folded the add/subtract into the /// new indexed load/store effectively and all of its uses are redirected to the /// new load/store. bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { if (Level < AfterLegalizeDAG) return false; bool isLoad = true; SDValue Ptr; EVT VT; if (LoadSDNode *LD = dyn_cast(N)) { if (LD->isIndexed()) return false; VT = LD->getMemoryVT(); if (!TLI.isIndexedLoadLegal(ISD::POST_INC, VT) && !TLI.isIndexedLoadLegal(ISD::POST_DEC, VT)) return false; Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { if (ST->isIndexed()) return false; VT = ST->getMemoryVT(); if (!TLI.isIndexedStoreLegal(ISD::POST_INC, VT) && !TLI.isIndexedStoreLegal(ISD::POST_DEC, VT)) return false; Ptr = ST->getBasePtr(); isLoad = false; } else { return false; } if (Ptr.getNode()->hasOneUse()) return false; for (SDNode *Op : Ptr.getNode()->uses()) { if (Op == N || (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)) continue; SDValue BasePtr; SDValue Offset; ISD::MemIndexedMode AM = ISD::UNINDEXED; if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) { // Don't create a indexed load / store with zero offset. if (isa(Offset) && cast(Offset)->isNullValue()) continue; // Try turning it into a post-indexed load / store except when // 1) All uses are load / store ops that use it as base ptr (and // it may be folded as addressing mmode). // 2) Op must be independent of N, i.e. Op is neither a predecessor // nor a successor of N. Otherwise, if Op is folded that would // create a cycle. if (isa(BasePtr) || isa(BasePtr)) continue; // Check for #1. bool TryNext = false; for (SDNode *Use : BasePtr.getNode()->uses()) { if (Use == Ptr.getNode()) continue; // If all the uses are load / store addresses, then don't do the // transformation. if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){ bool RealUse = false; for (SDNode *UseUse : Use->uses()) { if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI)) RealUse = true; } if (!RealUse) { TryNext = true; break; } } } if (TryNext) continue; // Check for #2 if (!Op->isPredecessorOf(N) && !N->isPredecessorOf(Op)) { SDValue Result = isLoad ? DAG.getIndexedLoad(SDValue(N,0), SDLoc(N), BasePtr, Offset, AM) : DAG.getIndexedStore(SDValue(N,0), SDLoc(N), BasePtr, Offset, AM); ++PostIndexedNodes; ++NodesCombined; DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); if (isLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); } else { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1)); } // Finally, since the node is now dead, remove it from the graph. deleteAndRecombine(N); // Replace the uses of Use with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0), Result.getValue(isLoad ? 1 : 0)); deleteAndRecombine(Op); return true; } } } return false; } /// \brief Return the base-pointer arithmetic from an indexed \p LD. SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) { ISD::MemIndexedMode AM = LD->getAddressingMode(); assert(AM != ISD::UNINDEXED); SDValue BP = LD->getOperand(1); SDValue Inc = LD->getOperand(2); // Some backends use TargetConstants for load offsets, but don't expect // TargetConstants in general ADD nodes. We can convert these constants into // regular Constants (if the constant is not opaque). assert((Inc.getOpcode() != ISD::TargetConstant || !cast(Inc)->isOpaque()) && "Cannot split out indexing using opaque target constants"); if (Inc.getOpcode() == ISD::TargetConstant) { ConstantSDNode *ConstInc = cast(Inc); Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), ConstInc->getValueType(0)); } unsigned Opc = (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB); return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc); } SDValue DAGCombiner::visitLOAD(SDNode *N) { LoadSDNode *LD = cast(N); SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); // If load is not volatile and there are no uses of the loaded value (and // the updated indexed value in case of indexed loads), change uses of the // chain value into uses of the chain input (i.e. delete the dead load). if (!LD->isVolatile()) { if (N->getValueType(1) == MVT::Other) { // Unindexed loads. if (!N->hasAnyUseOfValue(0)) { // It's not safe to use the two value CombineTo variant here. e.g. // v1, chain2 = load chain1, loc // v2, chain3 = load chain2, loc // v3 = add v2, c // Now we replace use of chain2 with chain1. This makes the second load // isomorphic to the one we are deleting, and thus makes this load live. DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG); dbgs() << "\nWith chain: "; Chain.getNode()->dump(&DAG); dbgs() << "\n"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); if (N->use_empty()) deleteAndRecombine(N); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } else { // Indexed loads. assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?"); // If this load has an opaque TargetConstant offset, then we cannot split // the indexing into an add/sub directly (that TargetConstant may not be // valid for a different type of node, and we cannot convert an opaque // target constant into a regular constant). bool HasOTCInc = LD->getOperand(2).getOpcode() == ISD::TargetConstant && cast(LD->getOperand(2))->isOpaque(); if (!N->hasAnyUseOfValue(0) && ((MaySplitLoadIndex && !HasOTCInc) || !N->hasAnyUseOfValue(1))) { SDValue Undef = DAG.getUNDEF(N->getValueType(0)); SDValue Index; if (N->hasAnyUseOfValue(1) && MaySplitLoadIndex && !HasOTCInc) { Index = SplitIndexingFromLoad(LD); // Try to fold the base pointer arithmetic into subsequent loads and // stores. AddUsersToWorklist(N); } else Index = DAG.getUNDEF(N->getValueType(1)); DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG); dbgs() << "\nWith: "; Undef.getNode()->dump(&DAG); dbgs() << " and 2 other values\n"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain); deleteAndRecombine(N); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } } // If this load is directly stored, replace the load value with the stored // value. // TODO: Handle store large -> read small portion. // TODO: Handle TRUNCSTORE/LOADEXT if (ISD::isNormalLoad(N) && !LD->isVolatile()) { if (ISD::isNON_TRUNCStore(Chain.getNode())) { StoreSDNode *PrevST = cast(Chain); if (PrevST->getBasePtr() == Ptr && PrevST->getValue().getValueType() == N->getValueType(0)) return CombineTo(N, Chain.getOperand(1), Chain); } } // Try to infer better alignment information than the load already has. if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) { if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { if (Align > LD->getMemOperand()->getBaseAlignment()) { SDValue NewLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr, LD->getPointerInfo(), LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), Align, LD->getAAInfo()); return CombineTo(N, NewLoad, SDValue(NewLoad.getNode(), 1), true); } } } bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA : DAG.getSubtarget().useAA(); #ifndef NDEBUG if (CombinerAAOnlyFunc.getNumOccurrences() && CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) UseAA = false; #endif if (UseAA && LD->isUnindexed()) { // Walk up chain skipping non-aliasing memory nodes. SDValue BetterChain = FindBetterChain(N, Chain); // If there is a better chain. if (Chain != BetterChain) { SDValue ReplLoad; // Replace the chain to void dependency. if (LD->getExtensionType() == ISD::NON_EXTLOAD) { ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD), BetterChain, Ptr, LD->getMemOperand()); } else { ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), LD->getValueType(0), BetterChain, Ptr, LD->getMemoryVT(), LD->getMemOperand()); } // Create token factor to keep old chain connected. SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Chain, ReplLoad.getValue(1)); // Make sure the new and old chains are cleaned up. AddToWorklist(Token.getNode()); // Replace uses with load result and token factor. Don't add users // to work list. return CombineTo(N, ReplLoad.getValue(0), Token, false); } } // Try transforming N to an indexed load. if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) return SDValue(N, 0); // Try to slice up N to more direct loads if the slices are mapped to // different register banks or pairing can take place. if (SliceUpLoad(N)) return SDValue(N, 0); return SDValue(); } namespace { /// \brief Helper structure used to slice a load in smaller loads. /// Basically a slice is obtained from the following sequence: /// Origin = load Ty1, Base /// Shift = srl Ty1 Origin, CstTy Amount /// Inst = trunc Shift to Ty2 /// /// Then, it will be rewriten into: /// Slice = load SliceTy, Base + SliceOffset /// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2 /// /// SliceTy is deduced from the number of bits that are actually used to /// build Inst. struct LoadedSlice { /// \brief Helper structure used to compute the cost of a slice. struct Cost { /// Are we optimizing for code size. bool ForCodeSize; /// Various cost. unsigned Loads; unsigned Truncates; unsigned CrossRegisterBanksCopies; unsigned ZExts; unsigned Shift; Cost(bool ForCodeSize = false) : ForCodeSize(ForCodeSize), Loads(0), Truncates(0), CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {} /// \brief Get the cost of one isolated slice. Cost(const LoadedSlice &LS, bool ForCodeSize = false) : ForCodeSize(ForCodeSize), Loads(1), Truncates(0), CrossRegisterBanksCopies(0), ZExts(0), Shift(0) { EVT TruncType = LS.Inst->getValueType(0); EVT LoadedType = LS.getLoadedType(); if (TruncType != LoadedType && !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType)) ZExts = 1; } /// \brief Account for slicing gain in the current cost. /// Slicing provide a few gains like removing a shift or a /// truncate. This method allows to grow the cost of the original /// load with the gain from this slice. void addSliceGain(const LoadedSlice &LS) { // Each slice saves a truncate. const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo(); if (!TLI.isTruncateFree(LS.Inst->getValueType(0), LS.Inst->getOperand(0).getValueType())) ++Truncates; // If there is a shift amount, this slice gets rid of it. if (LS.Shift) ++Shift; // If this slice can merge a cross register bank copy, account for it. if (LS.canMergeExpensiveCrossRegisterBankCopy()) ++CrossRegisterBanksCopies; } Cost &operator+=(const Cost &RHS) { Loads += RHS.Loads; Truncates += RHS.Truncates; CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies; ZExts += RHS.ZExts; Shift += RHS.Shift; return *this; } bool operator==(const Cost &RHS) const { return Loads == RHS.Loads && Truncates == RHS.Truncates && CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies && ZExts == RHS.ZExts && Shift == RHS.Shift; } bool operator!=(const Cost &RHS) const { return !(*this == RHS); } bool operator<(const Cost &RHS) const { // Assume cross register banks copies are as expensive as loads. // FIXME: Do we want some more target hooks? unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies; unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies; // Unless we are optimizing for code size, consider the // expensive operation first. if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS) return ExpensiveOpsLHS < ExpensiveOpsRHS; return (Truncates + ZExts + Shift + ExpensiveOpsLHS) < (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS); } bool operator>(const Cost &RHS) const { return RHS < *this; } bool operator<=(const Cost &RHS) const { return !(RHS < *this); } bool operator>=(const Cost &RHS) const { return !(*this < RHS); } }; // The last instruction that represent the slice. This should be a // truncate instruction. SDNode *Inst; // The original load instruction. LoadSDNode *Origin; // The right shift amount in bits from the original load. unsigned Shift; // The DAG from which Origin came from. // This is used to get some contextual information about legal types, etc. SelectionDAG *DAG; LoadedSlice(SDNode *Inst = nullptr, LoadSDNode *Origin = nullptr, unsigned Shift = 0, SelectionDAG *DAG = nullptr) : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {} LoadedSlice(const LoadedSlice &LS) : Inst(LS.Inst), Origin(LS.Origin), Shift(LS.Shift), DAG(LS.DAG) {} /// \brief Get the bits used in a chunk of bits \p BitWidth large. /// \return Result is \p BitWidth and has used bits set to 1 and /// not used bits set to 0. APInt getUsedBits() const { // Reproduce the trunc(lshr) sequence: // - Start from the truncated value. // - Zero extend to the desired bit width. // - Shift left. assert(Origin && "No original load to compare against."); unsigned BitWidth = Origin->getValueSizeInBits(0); assert(Inst && "This slice is not bound to an instruction"); assert(Inst->getValueSizeInBits(0) <= BitWidth && "Extracted slice is bigger than the whole type!"); APInt UsedBits(Inst->getValueSizeInBits(0), 0); UsedBits.setAllBits(); UsedBits = UsedBits.zext(BitWidth); UsedBits <<= Shift; return UsedBits; } /// \brief Get the size of the slice to be loaded in bytes. unsigned getLoadedSize() const { unsigned SliceSize = getUsedBits().countPopulation(); assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte."); return SliceSize / 8; } /// \brief Get the type that will be loaded for this slice. /// Note: This may not be the final type for the slice. EVT getLoadedType() const { assert(DAG && "Missing context"); LLVMContext &Ctxt = *DAG->getContext(); return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8); } /// \brief Get the alignment of the load used for this slice. unsigned getAlignment() const { unsigned Alignment = Origin->getAlignment(); unsigned Offset = getOffsetFromBase(); if (Offset != 0) Alignment = MinAlign(Alignment, Alignment + Offset); return Alignment; } /// \brief Check if this slice can be rewritten with legal operations. bool isLegal() const { // An invalid slice is not legal. if (!Origin || !Inst || !DAG) return false; // Offsets are for indexed load only, we do not handle that. if (Origin->getOffset().getOpcode() != ISD::UNDEF) return false; const TargetLowering &TLI = DAG->getTargetLoweringInfo(); // Check that the type is legal. EVT SliceType = getLoadedType(); if (!TLI.isTypeLegal(SliceType)) return false; // Check that the load is legal for this type. if (!TLI.isOperationLegal(ISD::LOAD, SliceType)) return false; // Check that the offset can be computed. // 1. Check its type. EVT PtrType = Origin->getBasePtr().getValueType(); if (PtrType == MVT::Untyped || PtrType.isExtended()) return false; // 2. Check that it fits in the immediate. if (!TLI.isLegalAddImmediate(getOffsetFromBase())) return false; // 3. Check that the computation is legal. if (!TLI.isOperationLegal(ISD::ADD, PtrType)) return false; // Check that the zext is legal if it needs one. EVT TruncateType = Inst->getValueType(0); if (TruncateType != SliceType && !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType)) return false; return true; } /// \brief Get the offset in bytes of this slice in the original chunk of /// bits. /// \pre DAG != nullptr. uint64_t getOffsetFromBase() const { assert(DAG && "Missing context."); bool IsBigEndian = DAG->getTargetLoweringInfo().getDataLayout()->isBigEndian(); assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported."); uint64_t Offset = Shift / 8; unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8; assert(!(Origin->getValueSizeInBits(0) & 0x7) && "The size of the original loaded type is not a multiple of a" " byte."); // If Offset is bigger than TySizeInBytes, it means we are loading all // zeros. This should have been optimized before in the process. assert(TySizeInBytes > Offset && "Invalid shift amount for given loaded size"); if (IsBigEndian) Offset = TySizeInBytes - Offset - getLoadedSize(); return Offset; } /// \brief Generate the sequence of instructions to load the slice /// represented by this object and redirect the uses of this slice to /// this new sequence of instructions. /// \pre this->Inst && this->Origin are valid Instructions and this /// object passed the legal check: LoadedSlice::isLegal returned true. /// \return The last instruction of the sequence used to load the slice. SDValue loadSlice() const { assert(Inst && Origin && "Unable to replace a non-existing slice."); const SDValue &OldBaseAddr = Origin->getBasePtr(); SDValue BaseAddr = OldBaseAddr; // Get the offset in that chunk of bytes w.r.t. the endianess. int64_t Offset = static_cast(getOffsetFromBase()); assert(Offset >= 0 && "Offset too big to fit in int64_t!"); if (Offset) { // BaseAddr = BaseAddr + Offset. EVT ArithType = BaseAddr.getValueType(); BaseAddr = DAG->getNode(ISD::ADD, SDLoc(Origin), ArithType, BaseAddr, DAG->getConstant(Offset, ArithType)); } // Create the type of the loaded slice according to its size. EVT SliceType = getLoadedType(); // Create the load for the slice. SDValue LastInst = DAG->getLoad( SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr, Origin->getPointerInfo().getWithOffset(Offset), Origin->isVolatile(), Origin->isNonTemporal(), Origin->isInvariant(), getAlignment()); // If the final type is not the same as the loaded type, this means that // we have to pad with zero. Create a zero extend for that. EVT FinalType = Inst->getValueType(0); if (SliceType != FinalType) LastInst = DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst); return LastInst; } /// \brief Check if this slice can be merged with an expensive cross register /// bank copy. E.g., /// i = load i32 /// f = bitcast i32 i to float bool canMergeExpensiveCrossRegisterBankCopy() const { if (!Inst || !Inst->hasOneUse()) return false; SDNode *Use = *Inst->use_begin(); if (Use->getOpcode() != ISD::BITCAST) return false; assert(DAG && "Missing context"); const TargetLowering &TLI = DAG->getTargetLoweringInfo(); EVT ResVT = Use->getValueType(0); const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT()); const TargetRegisterClass *ArgRC = TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT()); if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT)) return false; // At this point, we know that we perform a cross-register-bank copy. // Check if it is expensive. const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo(); // Assume bitcasts are cheap, unless both register classes do not // explicitly share a common sub class. if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC)) return false; // Check if it will be merged with the load. // 1. Check the alignment constraint. unsigned RequiredAlignment = TLI.getDataLayout()->getABITypeAlignment( ResVT.getTypeForEVT(*DAG->getContext())); if (RequiredAlignment > getAlignment()) return false; // 2. Check that the load is a legal operation for that type. if (!TLI.isOperationLegal(ISD::LOAD, ResVT)) return false; // 3. Check that we do not have a zext in the way. if (Inst->getValueType(0) != getLoadedType()) return false; return true; } }; } /// \brief Check that all bits set in \p UsedBits form a dense region, i.e., /// \p UsedBits looks like 0..0 1..1 0..0. static bool areUsedBitsDense(const APInt &UsedBits) { // If all the bits are one, this is dense! if (UsedBits.isAllOnesValue()) return true; // Get rid of the unused bits on the right. APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros()); // Get rid of the unused bits on the left. if (NarrowedUsedBits.countLeadingZeros()) NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits()); // Check that the chunk of bits is completely used. return NarrowedUsedBits.isAllOnesValue(); } /// \brief Check whether or not \p First and \p Second are next to each other /// in memory. This means that there is no hole between the bits loaded /// by \p First and the bits loaded by \p Second. static bool areSlicesNextToEachOther(const LoadedSlice &First, const LoadedSlice &Second) { assert(First.Origin == Second.Origin && First.Origin && "Unable to match different memory origins."); APInt UsedBits = First.getUsedBits(); assert((UsedBits & Second.getUsedBits()) == 0 && "Slices are not supposed to overlap."); UsedBits |= Second.getUsedBits(); return areUsedBitsDense(UsedBits); } /// \brief Adjust the \p GlobalLSCost according to the target /// paring capabilities and the layout of the slices. /// \pre \p GlobalLSCost should account for at least as many loads as /// there is in the slices in \p LoadedSlices. static void adjustCostForPairing(SmallVectorImpl &LoadedSlices, LoadedSlice::Cost &GlobalLSCost) { unsigned NumberOfSlices = LoadedSlices.size(); // If there is less than 2 elements, no pairing is possible. if (NumberOfSlices < 2) return; // Sort the slices so that elements that are likely to be next to each // other in memory are next to each other in the list. std::sort(LoadedSlices.begin(), LoadedSlices.end(), [](const LoadedSlice &LHS, const LoadedSlice &RHS) { assert(LHS.Origin == RHS.Origin && "Different bases not implemented."); return LHS.getOffsetFromBase() < RHS.getOffsetFromBase(); }); const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo(); // First (resp. Second) is the first (resp. Second) potentially candidate // to be placed in a paired load. const LoadedSlice *First = nullptr; const LoadedSlice *Second = nullptr; for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice, // Set the beginning of the pair. First = Second) { Second = &LoadedSlices[CurrSlice]; // If First is NULL, it means we start a new pair. // Get to the next slice. if (!First) continue; EVT LoadedType = First->getLoadedType(); // If the types of the slices are different, we cannot pair them. if (LoadedType != Second->getLoadedType()) continue; // Check if the target supplies paired loads for this type. unsigned RequiredAlignment = 0; if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) { // move to the next pair, this type is hopeless. Second = nullptr; continue; } // Check if we meet the alignment requirement. if (RequiredAlignment > First->getAlignment()) continue; // Check that both loads are next to each other in memory. if (!areSlicesNextToEachOther(*First, *Second)) continue; assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!"); --GlobalLSCost.Loads; // Move to the next pair. Second = nullptr; } } /// \brief Check the profitability of all involved LoadedSlice. /// Currently, it is considered profitable if there is exactly two /// involved slices (1) which are (2) next to each other in memory, and /// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3). /// /// Note: The order of the elements in \p LoadedSlices may be modified, but not /// the elements themselves. /// /// FIXME: When the cost model will be mature enough, we can relax /// constraints (1) and (2). static bool isSlicingProfitable(SmallVectorImpl &LoadedSlices, const APInt &UsedBits, bool ForCodeSize) { unsigned NumberOfSlices = LoadedSlices.size(); if (StressLoadSlicing) return NumberOfSlices > 1; // Check (1). if (NumberOfSlices != 2) return false; // Check (2). if (!areUsedBitsDense(UsedBits)) return false; // Check (3). LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize); // The original code has one big load. OrigCost.Loads = 1; for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) { const LoadedSlice &LS = LoadedSlices[CurrSlice]; // Accumulate the cost of all the slices. LoadedSlice::Cost SliceCost(LS, ForCodeSize); GlobalSlicingCost += SliceCost; // Account as cost in the original configuration the gain obtained // with the current slices. OrigCost.addSliceGain(LS); } // If the target supports paired load, adjust the cost accordingly. adjustCostForPairing(LoadedSlices, GlobalSlicingCost); return OrigCost > GlobalSlicingCost; } /// \brief If the given load, \p LI, is used only by trunc or trunc(lshr) /// operations, split it in the various pieces being extracted. /// /// This sort of thing is introduced by SROA. /// This slicing takes care not to insert overlapping loads. /// \pre LI is a simple load (i.e., not an atomic or volatile load). bool DAGCombiner::SliceUpLoad(SDNode *N) { if (Level < AfterLegalizeDAG) return false; LoadSDNode *LD = cast(N); if (LD->isVolatile() || !ISD::isNormalLoad(LD) || !LD->getValueType(0).isInteger()) return false; // Keep track of already used bits to detect overlapping values. // In that case, we will just abort the transformation. APInt UsedBits(LD->getValueSizeInBits(0), 0); SmallVector LoadedSlices; // Check if this load is used as several smaller chunks of bits. // Basically, look for uses in trunc or trunc(lshr) and record a new chain // of computation for each trunc. for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end(); UI != UIEnd; ++UI) { // Skip the uses of the chain. if (UI.getUse().getResNo() != 0) continue; SDNode *User = *UI; unsigned Shift = 0; // Check if this is a trunc(lshr). if (User->getOpcode() == ISD::SRL && User->hasOneUse() && isa(User->getOperand(1))) { Shift = cast(User->getOperand(1))->getZExtValue(); User = *User->use_begin(); } // At this point, User is a Truncate, iff we encountered, trunc or // trunc(lshr). if (User->getOpcode() != ISD::TRUNCATE) return false; // The width of the type must be a power of 2 and greater than 8-bits. // Otherwise the load cannot be represented in LLVM IR. // Moreover, if we shifted with a non-8-bits multiple, the slice // will be across several bytes. We do not support that. unsigned Width = User->getValueSizeInBits(0); if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7)) return 0; // Build the slice for this chain of computations. LoadedSlice LS(User, LD, Shift, &DAG); APInt CurrentUsedBits = LS.getUsedBits(); // Check if this slice overlaps with another. if ((CurrentUsedBits & UsedBits) != 0) return false; // Update the bits used globally. UsedBits |= CurrentUsedBits; // Check if the new slice would be legal. if (!LS.isLegal()) return false; // Record the slice. LoadedSlices.push_back(LS); } // Abort slicing if it does not seem to be profitable. if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize)) return false; ++SlicedLoads; // Rewrite each chain to use an independent load. // By construction, each chain can be represented by a unique load. // Prepare the argument for the new token factor for all the slices. SmallVector ArgChains; for (SmallVectorImpl::const_iterator LSIt = LoadedSlices.begin(), LSItEnd = LoadedSlices.end(); LSIt != LSItEnd; ++LSIt) { SDValue SliceInst = LSIt->loadSlice(); CombineTo(LSIt->Inst, SliceInst, true); if (SliceInst.getNode()->getOpcode() != ISD::LOAD) SliceInst = SliceInst.getOperand(0); assert(SliceInst->getOpcode() == ISD::LOAD && "It takes more than a zext to get to the loaded slice!!"); ArgChains.push_back(SliceInst.getValue(1)); } SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, ArgChains); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); return true; } /// Check to see if V is (and load (ptr), imm), where the load is having /// specific bytes cleared out. If so, return the byte size being masked out /// and the shift amount. static std::pair CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) { std::pair Result(0, 0); // Check for the structure we're looking for. if (V->getOpcode() != ISD::AND || !isa(V->getOperand(1)) || !ISD::isNormalLoad(V->getOperand(0).getNode())) return Result; // Check the chain and pointer. LoadSDNode *LD = cast(V->getOperand(0)); if (LD->getBasePtr() != Ptr) return Result; // Not from same pointer. // The store should be chained directly to the load or be an operand of a // tokenfactor. if (LD == Chain.getNode()) ; // ok. else if (Chain->getOpcode() != ISD::TokenFactor) return Result; // Fail. else { bool isOk = false; for (unsigned i = 0, e = Chain->getNumOperands(); i != e; ++i) if (Chain->getOperand(i).getNode() == LD) { isOk = true; break; } if (!isOk) return Result; } // This only handles simple types. if (V.getValueType() != MVT::i16 && V.getValueType() != MVT::i32 && V.getValueType() != MVT::i64) return Result; // Check the constant mask. Invert it so that the bits being masked out are // 0 and the bits being kept are 1. Use getSExtValue so that leading bits // follow the sign bit for uniformity. uint64_t NotMask = ~cast(V->getOperand(1))->getSExtValue(); unsigned NotMaskLZ = countLeadingZeros(NotMask); if (NotMaskLZ & 7) return Result; // Must be multiple of a byte. unsigned NotMaskTZ = countTrailingZeros(NotMask); if (NotMaskTZ & 7) return Result; // Must be multiple of a byte. if (NotMaskLZ == 64) return Result; // All zero mask. // See if we have a continuous run of bits. If so, we have 0*1+0* if (CountTrailingOnes_64(NotMask >> NotMaskTZ)+NotMaskTZ+NotMaskLZ != 64) return Result; // Adjust NotMaskLZ down to be from the actual size of the int instead of i64. if (V.getValueType() != MVT::i64 && NotMaskLZ) NotMaskLZ -= 64-V.getValueSizeInBits(); unsigned MaskedBytes = (V.getValueSizeInBits()-NotMaskLZ-NotMaskTZ)/8; switch (MaskedBytes) { case 1: case 2: case 4: break; default: return Result; // All one mask, or 5-byte mask. } // Verify that the first bit starts at a multiple of mask so that the access // is aligned the same as the access width. if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result; Result.first = MaskedBytes; Result.second = NotMaskTZ/8; return Result; } /// Check to see if IVal is something that provides a value as specified by /// MaskInfo. If so, replace the specified store with a narrower store of /// truncated IVal. static SDNode * ShrinkLoadReplaceStoreWithStore(const std::pair &MaskInfo, SDValue IVal, StoreSDNode *St, DAGCombiner *DC) { unsigned NumBytes = MaskInfo.first; unsigned ByteShift = MaskInfo.second; SelectionDAG &DAG = DC->getDAG(); // Check to see if IVal is all zeros in the part being masked in by the 'or' // that uses this. If not, this is not a replacement. APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(), ByteShift*8, (ByteShift+NumBytes)*8); if (!DAG.MaskedValueIsZero(IVal, Mask)) return nullptr; // Check that it is legal on the target to do this. It is legal if the new // VT we're shrinking to (i8/i16/i32) is legal or we're still before type // legalization. MVT VT = MVT::getIntegerVT(NumBytes*8); if (!DC->isTypeLegal(VT)) return nullptr; // Okay, we can do this! Replace the 'St' store with a store of IVal that is // shifted by ByteShift and truncated down to NumBytes. if (ByteShift) IVal = DAG.getNode(ISD::SRL, SDLoc(IVal), IVal.getValueType(), IVal, DAG.getConstant(ByteShift*8, DC->getShiftAmountTy(IVal.getValueType()))); // Figure out the offset for the store and the alignment of the access. unsigned StOffset; unsigned NewAlign = St->getAlignment(); if (DAG.getTargetLoweringInfo().isLittleEndian()) StOffset = ByteShift; else StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes; SDValue Ptr = St->getBasePtr(); if (StOffset) { Ptr = DAG.getNode(ISD::ADD, SDLoc(IVal), Ptr.getValueType(), Ptr, DAG.getConstant(StOffset, Ptr.getValueType())); NewAlign = MinAlign(NewAlign, StOffset); } // Truncate down to the new size. IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal); ++OpsNarrowed; return DAG.getStore(St->getChain(), SDLoc(St), IVal, Ptr, St->getPointerInfo().getWithOffset(StOffset), false, false, NewAlign).getNode(); } /// Look for sequence of load / op / store where op is one of 'or', 'xor', and /// 'and' of immediates. If 'op' is only touching some of the loaded bits, try /// narrowing the load and store if it would end up being a win for performance /// or code size. SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { StoreSDNode *ST = cast(N); if (ST->isVolatile()) return SDValue(); SDValue Chain = ST->getChain(); SDValue Value = ST->getValue(); SDValue Ptr = ST->getBasePtr(); EVT VT = Value.getValueType(); if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse()) return SDValue(); unsigned Opc = Value.getOpcode(); // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst // is a byte mask indicating a consecutive number of bytes, check to see if // Y is known to provide just those bytes. If so, we try to replace the // load + replace + store sequence with a single (narrower) store, which makes // the load dead. if (Opc == ISD::OR) { std::pair MaskedLoad; MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain); if (MaskedLoad.first) if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad, Value.getOperand(1), ST,this)) return SDValue(NewST, 0); // Or is commutative, so try swapping X and Y. MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain); if (MaskedLoad.first) if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad, Value.getOperand(0), ST,this)) return SDValue(NewST, 0); } if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) || Value.getOperand(1).getOpcode() != ISD::Constant) return SDValue(); SDValue N0 = Value.getOperand(0); if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && Chain == SDValue(N0.getNode(), 1)) { LoadSDNode *LD = cast(N0); if (LD->getBasePtr() != Ptr || LD->getPointerInfo().getAddrSpace() != ST->getPointerInfo().getAddrSpace()) return SDValue(); // Find the type to narrow it the load / op / store to. SDValue N1 = Value.getOperand(1); unsigned BitWidth = N1.getValueSizeInBits(); APInt Imm = cast(N1)->getAPIntValue(); if (Opc == ISD::AND) Imm ^= APInt::getAllOnesValue(BitWidth); if (Imm == 0 || Imm.isAllOnesValue()) return SDValue(); unsigned ShAmt = Imm.countTrailingZeros(); unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1; unsigned NewBW = NextPowerOf2(MSB - ShAmt); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW); + // The narrowing should be profitable, the load/store operation should be + // legal (or custom) and the store size should be equal to the NewVT width. while (NewBW < BitWidth && !(TLI.isOperationLegalOrCustom(Opc, NewVT) && TLI.isNarrowingProfitable(VT, NewVT))) { NewBW = NextPowerOf2(NewBW); NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW); } if (NewBW >= BitWidth) return SDValue(); // If the lsb changed does not start at the type bitwidth boundary, // start at the previous one. if (ShAmt % NewBW) ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW; APInt Mask = APInt::getBitsSet(BitWidth, ShAmt, std::min(BitWidth, ShAmt + NewBW)); if ((Imm & Mask) == Imm) { APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW); if (Opc == ISD::AND) NewImm ^= APInt::getAllOnesValue(NewBW); uint64_t PtrOff = ShAmt / 8; // For big endian targets, we need to adjust the offset to the pointer to // load the correct bytes. if (TLI.isBigEndian()) PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff; unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff); Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext()); if (NewAlign < TLI.getDataLayout()->getABITypeAlignment(NewVTTy)) return SDValue(); SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(LD), Ptr.getValueType(), Ptr, DAG.getConstant(PtrOff, Ptr.getValueType())); SDValue NewLD = DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr, LD->getPointerInfo().getWithOffset(PtrOff), LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), NewAlign, LD->getAAInfo()); SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD, DAG.getConstant(NewImm, NewVT)); SDValue NewST = DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr, ST->getPointerInfo().getWithOffset(PtrOff), false, false, NewAlign); AddToWorklist(NewPtr.getNode()); AddToWorklist(NewLD.getNode()); AddToWorklist(NewVal.getNode()); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1)); ++OpsNarrowed; return NewST; } } return SDValue(); } /// For a given floating point load / store pair, if the load value isn't used /// by any other operations, then consider transforming the pair to integer /// load / store operations if the target deems the transformation profitable. SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) { StoreSDNode *ST = cast(N); SDValue Chain = ST->getChain(); SDValue Value = ST->getValue(); if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) && Value.hasOneUse() && Chain == SDValue(Value.getNode(), 1)) { LoadSDNode *LD = cast(Value); EVT VT = LD->getMemoryVT(); if (!VT.isFloatingPoint() || VT != ST->getMemoryVT() || LD->isNonTemporal() || ST->isNonTemporal() || LD->getPointerInfo().getAddrSpace() != 0 || ST->getPointerInfo().getAddrSpace() != 0) return SDValue(); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); if (!TLI.isOperationLegal(ISD::LOAD, IntVT) || !TLI.isOperationLegal(ISD::STORE, IntVT) || !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) || !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT)) return SDValue(); unsigned LDAlign = LD->getAlignment(); unsigned STAlign = ST->getAlignment(); Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext()); unsigned ABIAlign = TLI.getDataLayout()->getABITypeAlignment(IntVTTy); if (LDAlign < ABIAlign || STAlign < ABIAlign) return SDValue(); SDValue NewLD = DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), false, false, false, LDAlign); SDValue NewST = DAG.getStore(NewLD.getValue(1), SDLoc(N), NewLD, ST->getBasePtr(), ST->getPointerInfo(), false, false, STAlign); AddToWorklist(NewLD.getNode()); AddToWorklist(NewST.getNode()); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1)); ++LdStFP2Int; return NewST; } return SDValue(); } /// Helper struct to parse and store a memory address as base + index + offset. /// We ignore sign extensions when it is safe to do so. /// The following two expressions are not equivalent. To differentiate we need /// to store whether there was a sign extension involved in the index /// computation. /// (load (i64 add (i64 copyfromreg %c) /// (i64 signextend (add (i8 load %index) /// (i8 1)))) /// vs /// /// (load (i64 add (i64 copyfromreg %c) /// (i64 signextend (i32 add (i32 signextend (i8 load %index)) /// (i32 1))))) struct BaseIndexOffset { SDValue Base; SDValue Index; int64_t Offset; bool IsIndexSignExt; BaseIndexOffset() : Offset(0), IsIndexSignExt(false) {} BaseIndexOffset(SDValue Base, SDValue Index, int64_t Offset, bool IsIndexSignExt) : Base(Base), Index(Index), Offset(Offset), IsIndexSignExt(IsIndexSignExt) {} bool equalBaseIndex(const BaseIndexOffset &Other) { return Other.Base == Base && Other.Index == Index && Other.IsIndexSignExt == IsIndexSignExt; } /// Parses tree in Ptr for base, index, offset addresses. static BaseIndexOffset match(SDValue Ptr) { bool IsIndexSignExt = false; // We only can pattern match BASE + INDEX + OFFSET. If Ptr is not an ADD // instruction, then it could be just the BASE or everything else we don't // know how to handle. Just use Ptr as BASE and give up. if (Ptr->getOpcode() != ISD::ADD) return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt); // We know that we have at least an ADD instruction. Try to pattern match // the simple case of BASE + OFFSET. if (isa(Ptr->getOperand(1))) { int64_t Offset = cast(Ptr->getOperand(1))->getSExtValue(); return BaseIndexOffset(Ptr->getOperand(0), SDValue(), Offset, IsIndexSignExt); } // Inside a loop the current BASE pointer is calculated using an ADD and a // MUL instruction. In this case Ptr is the actual BASE pointer. // (i64 add (i64 %array_ptr) // (i64 mul (i64 %induction_var) // (i64 %element_size))) if (Ptr->getOperand(1)->getOpcode() == ISD::MUL) return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt); // Look at Base + Index + Offset cases. SDValue Base = Ptr->getOperand(0); SDValue IndexOffset = Ptr->getOperand(1); // Skip signextends. if (IndexOffset->getOpcode() == ISD::SIGN_EXTEND) { IndexOffset = IndexOffset->getOperand(0); IsIndexSignExt = true; } // Either the case of Base + Index (no offset) or something else. if (IndexOffset->getOpcode() != ISD::ADD) return BaseIndexOffset(Base, IndexOffset, 0, IsIndexSignExt); // Now we have the case of Base + Index + offset. SDValue Index = IndexOffset->getOperand(0); SDValue Offset = IndexOffset->getOperand(1); if (!isa(Offset)) return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt); // Ignore signextends. if (Index->getOpcode() == ISD::SIGN_EXTEND) { Index = Index->getOperand(0); IsIndexSignExt = true; } else IsIndexSignExt = false; int64_t Off = cast(Offset)->getSExtValue(); return BaseIndexOffset(Base, Index, Off, IsIndexSignExt); } }; /// Holds a pointer to an LSBaseSDNode as well as information on where it /// is located in a sequence of memory operations connected by a chain. struct MemOpLink { MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq): MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { } // Ptr to the mem node. LSBaseSDNode *MemNode; // Offset from the base ptr. int64_t OffsetFromBase; // What is the sequence number of this mem node. // Lowest mem operand in the DAG starts at zero. unsigned SequenceNum; }; bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { EVT MemVT = St->getMemoryVT(); int64_t ElementSizeBytes = MemVT.getSizeInBits()/8; bool NoVectors = DAG.getMachineFunction().getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); // Don't merge vectors into wider inputs. if (MemVT.isVector() || !MemVT.isSimple()) return false; // Perform an early exit check. Do not bother looking at stored values that // are not constants or loads. SDValue StoredVal = St->getValue(); bool IsLoadSrc = isa(StoredVal); if (!isa(StoredVal) && !isa(StoredVal) && !IsLoadSrc) return false; // Only look at ends of store sequences. SDValue Chain = SDValue(St, 0); if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE) return false; // This holds the base pointer, index, and the offset in bytes from the base // pointer. BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr()); // We must have a base and an offset. if (!BasePtr.Base.getNode()) return false; // Do not handle stores to undef base pointers. if (BasePtr.Base.getOpcode() == ISD::UNDEF) return false; // Save the LoadSDNodes that we find in the chain. // We need to make sure that these nodes do not interfere with // any of the store nodes. SmallVector AliasLoadNodes; // Save the StoreSDNodes that we find in the chain. SmallVector StoreNodes; // Walk up the chain and look for nodes with offsets from the same // base pointer. Stop when reaching an instruction with a different kind // or instruction which has a different base pointer. unsigned Seq = 0; StoreSDNode *Index = St; while (Index) { // If the chain has more than one use, then we can't reorder the mem ops. if (Index != St && !SDValue(Index, 0)->hasOneUse()) break; // Find the base pointer and offset for this memory node. BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr()); // Check that the base pointer is the same as the original one. if (!Ptr.equalBaseIndex(BasePtr)) break; // Check that the alignment is the same. if (Index->getAlignment() != St->getAlignment()) break; // The memory operands must not be volatile. if (Index->isVolatile() || Index->isIndexed()) break; // No truncation. if (StoreSDNode *St = dyn_cast(Index)) if (St->isTruncatingStore()) break; // The stored memory type must be the same. if (Index->getMemoryVT() != MemVT) break; // We do not allow unaligned stores because we want to prevent overriding // stores. if (Index->getAlignment()*8 != MemVT.getSizeInBits()) break; // We found a potential memory operand to merge. StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++)); // Find the next memory operand in the chain. If the next operand in the // chain is a store then move up and continue the scan with the next // memory operand. If the next operand is a load save it and use alias // information to check if it interferes with anything. SDNode *NextInChain = Index->getChain().getNode(); while (1) { if (StoreSDNode *STn = dyn_cast(NextInChain)) { // We found a store node. Use it for the next iteration. Index = STn; break; } else if (LoadSDNode *Ldn = dyn_cast(NextInChain)) { if (Ldn->isVolatile()) { Index = nullptr; break; } // Save the load node for later. Continue the scan. AliasLoadNodes.push_back(Ldn); NextInChain = Ldn->getChain().getNode(); continue; } else { Index = nullptr; break; } } } // Check if there is anything to merge. if (StoreNodes.size() < 2) return false; // Sort the memory operands according to their distance from the base pointer. std::sort(StoreNodes.begin(), StoreNodes.end(), [](MemOpLink LHS, MemOpLink RHS) { return LHS.OffsetFromBase < RHS.OffsetFromBase || (LHS.OffsetFromBase == RHS.OffsetFromBase && LHS.SequenceNum > RHS.SequenceNum); }); // Scan the memory operations on the chain and find the first non-consecutive // store memory address. unsigned LastConsecutiveStore = 0; int64_t StartAddress = StoreNodes[0].OffsetFromBase; for (unsigned i = 0, e = StoreNodes.size(); i < e; ++i) { // Check that the addresses are consecutive starting from the second // element in the list of stores. if (i > 0) { int64_t CurrAddress = StoreNodes[i].OffsetFromBase; if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; } bool Alias = false; // Check if this store interferes with any of the loads that we found. for (unsigned ld = 0, lde = AliasLoadNodes.size(); ld < lde; ++ld) if (isAlias(AliasLoadNodes[ld], StoreNodes[i].MemNode)) { Alias = true; break; } // We found a load that alias with this store. Stop the sequence. if (Alias) break; // Mark this node as useful. LastConsecutiveStore = i; } // The node with the lowest store address. LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; // Store the constants into memory as one consecutive store. if (!IsLoadSrc) { unsigned LastLegalType = 0; unsigned LastLegalVectorType = 0; bool NonZero = false; for (unsigned i=0; i(StoreNodes[i].MemNode); SDValue StoredVal = St->getValue(); if (ConstantSDNode *C = dyn_cast(StoredVal)) { NonZero |= !C->isNullValue(); } else if (ConstantFPSDNode *C = dyn_cast(StoredVal)) { NonZero |= !C->getConstantFPValue()->isNullValue(); } else { // Non-constant. break; } // Find a legal type for the constant store. unsigned StoreBW = (i+1) * ElementSizeBytes * 8; EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW); if (TLI.isTypeLegal(StoreTy)) LastLegalType = i+1; // Or check whether a truncstore is legal. else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType()); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy)) LastLegalType = i+1; } // Find a legal type for the vector store. EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1); if (TLI.isTypeLegal(Ty)) LastLegalVectorType = i + 1; } // We only use vectors if the constant is known to be zero and the // function is not marked with the noimplicitfloat attribute. if (NonZero || NoVectors) LastLegalVectorType = 0; // Check if we found a legal integer type to store. if (LastLegalType == 0 && LastLegalVectorType == 0) return false; bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType; // Make sure we have something to merge. if (NumElem < 2) return false; unsigned EarliestNodeUsed = 0; for (unsigned i=0; i < NumElem; ++i) { // Find a chain for the new wide-store operand. Notice that some // of the store nodes that we found may not be selected for inclusion // in the wide store. The chain we use needs to be the chain of the // earliest store node which is *used* and replaced by the wide store. if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum) EarliestNodeUsed = i; } // The earliest Node in the DAG. LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode; SDLoc DL(StoreNodes[0].MemNode); SDValue StoredVal; if (UseVector) { // Find a legal type for the vector store. EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem); assert(TLI.isTypeLegal(Ty) && "Illegal vector store"); StoredVal = DAG.getConstant(0, Ty); } else { unsigned StoreBW = NumElem * ElementSizeBytes * 8; APInt StoreInt(StoreBW, 0); // Construct a single integer constant which is made of the smaller // constant inputs. bool IsLE = TLI.isLittleEndian(); for (unsigned i = 0; i < NumElem ; ++i) { unsigned Idx = IsLE ?(NumElem - 1 - i) : i; StoreSDNode *St = cast(StoreNodes[Idx].MemNode); SDValue Val = St->getValue(); StoreInt<<=ElementSizeBytes*8; if (ConstantSDNode *C = dyn_cast(Val)) { StoreInt|=C->getAPIntValue().zext(StoreBW); } else if (ConstantFPSDNode *C = dyn_cast(Val)) { StoreInt|= C->getValueAPF().bitcastToAPInt().zext(StoreBW); } else { llvm_unreachable("Invalid constant element type"); } } // Create the new Load and Store operations. EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW); StoredVal = DAG.getConstant(StoreInt, StoreTy); } SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), false, false, FirstInChain->getAlignment()); // Replace the first store with the new store CombineTo(EarliestOp, NewStore); // Erase all other stores. for (unsigned i = 0; i < NumElem ; ++i) { if (StoreNodes[i].MemNode == EarliestOp) continue; StoreSDNode *St = cast(StoreNodes[i].MemNode); // ReplaceAllUsesWith will replace all uses that existed when it was // called, but graph optimizations may cause new ones to appear. For // example, the case in pr14333 looks like // // St's chain -> St -> another store -> X // // And the only difference from St to the other store is the chain. // When we change it's chain to be St's chain they become identical, // get CSEed and the net result is that X is now a use of St. // Since we know that St is redundant, just iterate. while (!St->use_empty()) DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain()); deleteAndRecombine(St); } return true; } // Below we handle the case of multiple consecutive stores that // come from multiple consecutive loads. We merge them into a single // wide load and a single wide store. // Look for load nodes which are used by the stored values. SmallVector LoadNodes; // Find acceptable loads. Loads need to have the same chain (token factor), // must not be zext, volatile, indexed, and they must be consecutive. BaseIndexOffset LdBasePtr; for (unsigned i=0; i(StoreNodes[i].MemNode); LoadSDNode *Ld = dyn_cast(St->getValue()); if (!Ld) break; // Loads must only have one use. if (!Ld->hasNUsesOfValue(1, 0)) break; // Check that the alignment is the same as the stores. if (Ld->getAlignment() != St->getAlignment()) break; // The memory operands must not be volatile. if (Ld->isVolatile() || Ld->isIndexed()) break; // We do not accept ext loads. if (Ld->getExtensionType() != ISD::NON_EXTLOAD) break; // The stored memory type must be the same. if (Ld->getMemoryVT() != MemVT) break; BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr()); // If this is not the first ptr that we check. if (LdBasePtr.Base.getNode()) { // The base ptr must be the same. if (!LdPtr.equalBaseIndex(LdBasePtr)) break; } else { // Check that all other base pointers are the same as this one. LdBasePtr = LdPtr; } // We found a potential memory operand to merge. LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset, 0)); } if (LoadNodes.size() < 2) return false; // If we have load/store pair instructions and we only have two values, // don't bother. unsigned RequiredAlignment; if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) && St->getAlignment() >= RequiredAlignment) return false; // Scan the memory operations on the chain and find the first non-consecutive // load memory address. These variables hold the index in the store node // array. unsigned LastConsecutiveLoad = 0; // This variable refers to the size and not index in the array. unsigned LastLegalVectorType = 0; unsigned LastLegalIntegerType = 0; StartAddress = LoadNodes[0].OffsetFromBase; SDValue FirstChain = LoadNodes[0].MemNode->getChain(); for (unsigned i = 1; i < LoadNodes.size(); ++i) { // All loads much share the same chain. if (LoadNodes[i].MemNode->getChain() != FirstChain) break; int64_t CurrAddress = LoadNodes[i].OffsetFromBase; if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; LastConsecutiveLoad = i; // Find a legal type for the vector store. EVT StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1); if (TLI.isTypeLegal(StoreTy)) LastLegalVectorType = i + 1; // Find a legal type for the integer store. unsigned StoreBW = (i+1) * ElementSizeBytes * 8; StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW); if (TLI.isTypeLegal(StoreTy)) LastLegalIntegerType = i + 1; // Or check whether a truncstore and extload is legal. else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(*DAG.getContext(), StoreTy); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy)) LastLegalIntegerType = i+1; } } // Only use vector types if the vector type is larger than the integer type. // If they are the same, use integers. bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors; unsigned LastLegalType = std::max(LastLegalVectorType, LastLegalIntegerType); // We add +1 here because the LastXXX variables refer to location while // the NumElem refers to array/index size. unsigned NumElem = std::min(LastConsecutiveStore, LastConsecutiveLoad) + 1; NumElem = std::min(LastLegalType, NumElem); if (NumElem < 2) return false; // The earliest Node in the DAG. unsigned EarliestNodeUsed = 0; LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode; for (unsigned i=1; i StoreNodes[EarliestNodeUsed].SequenceNum) EarliestNodeUsed = i; } // Find if it is better to use vectors or integers to load and store // to memory. EVT JointMemOpVT; if (UseVectorTy) { JointMemOpVT = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem); } else { unsigned StoreBW = NumElem * ElementSizeBytes * 8; JointMemOpVT = EVT::getIntegerVT(*DAG.getContext(), StoreBW); } SDLoc LoadDL(LoadNodes[0].MemNode); SDLoc StoreDL(StoreNodes[0].MemNode); LoadSDNode *FirstLoad = cast(LoadNodes[0].MemNode); SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), false, false, false, FirstLoad->getAlignment()); SDValue NewStore = DAG.getStore(EarliestOp->getChain(), StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), false, false, FirstInChain->getAlignment()); // Replace one of the loads with the new load. LoadSDNode *Ld = cast(LoadNodes[0].MemNode); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1)); // Remove the rest of the load chains. for (unsigned i = 1; i < NumElem ; ++i) { // Replace all chain users of the old load nodes with the chain of the new // load node. LoadSDNode *Ld = cast(LoadNodes[i].MemNode); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Ld->getChain()); } // Replace the first store with the new store. CombineTo(EarliestOp, NewStore); // Erase all other stores. for (unsigned i = 0; i < NumElem ; ++i) { // Remove all Store nodes. if (StoreNodes[i].MemNode == EarliestOp) continue; StoreSDNode *St = cast(StoreNodes[i].MemNode); DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain()); deleteAndRecombine(St); } return true; } SDValue DAGCombiner::visitSTORE(SDNode *N) { StoreSDNode *ST = cast(N); SDValue Chain = ST->getChain(); SDValue Value = ST->getValue(); SDValue Ptr = ST->getBasePtr(); // If this is a store of a bit convert, store the input value if the // resultant store does not need a higher alignment than the original. if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() && ST->isUnindexed()) { unsigned OrigAlign = ST->getAlignment(); EVT SVT = Value.getOperand(0).getValueType(); unsigned Align = TLI.getDataLayout()-> getABITypeAlignment(SVT.getTypeForEVT(*DAG.getContext())); if (Align <= OrigAlign && ((!LegalOperations && !ST->isVolatile()) || TLI.isOperationLegalOrCustom(ISD::STORE, SVT))) return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, ST->getPointerInfo(), ST->isVolatile(), ST->isNonTemporal(), OrigAlign, ST->getAAInfo()); } // Turn 'store undef, Ptr' -> nothing. if (Value.getOpcode() == ISD::UNDEF && ST->isUnindexed()) return Chain; // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr' if (ConstantFPSDNode *CFP = dyn_cast(Value)) { // NOTE: If the original store is volatile, this transform must not increase // the number of stores. For example, on x86-32 an f64 can be stored in one // processor operation but an i64 (which is not legal) requires two. So the // transform should not be done in this case. if (Value.getOpcode() != ISD::TargetConstantFP) { SDValue Tmp; switch (CFP->getSimpleValueType(0).SimpleTy) { default: llvm_unreachable("Unknown FP type"); case MVT::f16: // We don't do this for these yet. case MVT::f80: case MVT::f128: case MVT::ppcf128: break; case MVT::f32: if ((isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) || TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF(). bitcastToAPInt().getZExtValue(), MVT::i32); return DAG.getStore(Chain, SDLoc(N), Tmp, Ptr, ST->getMemOperand()); } break; case MVT::f64: if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations && !ST->isVolatile()) || TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) { Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). getZExtValue(), MVT::i64); return DAG.getStore(Chain, SDLoc(N), Tmp, Ptr, ST->getMemOperand()); } if (!ST->isVolatile() && TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { // Many FP stores are not made apparent until after legalize, e.g. for // argument passing. Since this is so common, custom legalize the // 64-bit integer store into two 32-bit stores. uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, MVT::i32); SDValue Hi = DAG.getConstant(Val >> 32, MVT::i32); if (TLI.isBigEndian()) std::swap(Lo, Hi); unsigned Alignment = ST->getAlignment(); bool isVolatile = ST->isVolatile(); bool isNonTemporal = ST->isNonTemporal(); AAMDNodes AAInfo = ST->getAAInfo(); SDValue St0 = DAG.getStore(Chain, SDLoc(ST), Lo, Ptr, ST->getPointerInfo(), isVolatile, isNonTemporal, ST->getAlignment(), AAInfo); Ptr = DAG.getNode(ISD::ADD, SDLoc(N), Ptr.getValueType(), Ptr, DAG.getConstant(4, Ptr.getValueType())); Alignment = MinAlign(Alignment, 4U); SDValue St1 = DAG.getStore(Chain, SDLoc(ST), Hi, Ptr, ST->getPointerInfo().getWithOffset(4), isVolatile, isNonTemporal, Alignment, AAInfo); return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, St0, St1); } break; } } } // Try to infer better alignment information than the store already has. if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) { if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { if (Align > ST->getAlignment()) return DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(), ST->getMemoryVT(), ST->isVolatile(), ST->isNonTemporal(), Align, ST->getAAInfo()); } } // Try transforming a pair floating point load / store ops to integer // load / store ops. SDValue NewST = TransformFPLoadStorePair(N); if (NewST.getNode()) return NewST; bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA : DAG.getSubtarget().useAA(); #ifndef NDEBUG if (CombinerAAOnlyFunc.getNumOccurrences() && CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) UseAA = false; #endif if (UseAA && ST->isUnindexed()) { // Walk up chain skipping non-aliasing memory nodes. SDValue BetterChain = FindBetterChain(N, Chain); // If there is a better chain. if (Chain != BetterChain) { SDValue ReplStore; // Replace the chain to avoid dependency. if (ST->isTruncatingStore()) { ReplStore = DAG.getTruncStore(BetterChain, SDLoc(N), Value, Ptr, ST->getMemoryVT(), ST->getMemOperand()); } else { ReplStore = DAG.getStore(BetterChain, SDLoc(N), Value, Ptr, ST->getMemOperand()); } // Create token to keep both nodes around. SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Chain, ReplStore); // Make sure the new and old chains are cleaned up. AddToWorklist(Token.getNode()); // Don't add users to work list. return CombineTo(N, Token, false); } } // Try transforming N to an indexed store. if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) return SDValue(N, 0); // FIXME: is there such a thing as a truncating indexed store? if (ST->isTruncatingStore() && ST->isUnindexed() && Value.getValueType().isInteger()) { // See if we can simplify the input to this truncstore with knowledge that // only the low bits are being used. For example: // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8" SDValue Shorter = GetDemandedBits(Value, APInt::getLowBitsSet( Value.getValueType().getScalarType().getSizeInBits(), ST->getMemoryVT().getScalarType().getSizeInBits())); AddToWorklist(Value.getNode()); if (Shorter.getNode()) return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(), ST->getMemOperand()); // Otherwise, see if we can simplify the operation with // SimplifyDemandedBits, which only works if the value has a single use. if (SimplifyDemandedBits(Value, APInt::getLowBitsSet( Value.getValueType().getScalarType().getSizeInBits(), ST->getMemoryVT().getScalarType().getSizeInBits()))) return SDValue(N, 0); } // If this is a load followed by a store to the same location, then the store // is dead/noop. if (LoadSDNode *Ld = dyn_cast(Value)) { if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() && ST->isUnindexed() && !ST->isVolatile() && // There can't be any side effects between the load and store, such as // a call or store. Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) { // The store is dead, remove it. return Chain; } } // If this is a store followed by a store with the same value to the same // location, then the store is dead/noop. if (StoreSDNode *ST1 = dyn_cast(Chain)) { if (ST1->getBasePtr() == Ptr && ST->getMemoryVT() == ST1->getMemoryVT() && ST1->getValue() == Value && ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() && !ST1->isVolatile()) { // The store is dead, remove it. return Chain; } } // If this is an FP_ROUND or TRUNC followed by a store, fold this into a // truncating store. We can do this even if this is already a truncstore. if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE) && Value.getNode()->hasOneUse() && ST->isUnindexed() && TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), ST->getMemoryVT())) { return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, ST->getMemoryVT(), ST->getMemOperand()); } // Only perform this optimization before the types are legal, because we // don't want to perform this optimization on every DAGCombine invocation. if (!LegalTypes) { bool EverChanged = false; do { // There can be multiple store sequences on the same chain. // Keep trying to merge store sequences until we are unable to do so // or until we merge the last store on the chain. bool Changed = MergeConsecutiveStores(ST); EverChanged |= Changed; if (!Changed) break; } while (ST->getOpcode() != ISD::DELETED_NODE); if (EverChanged) return SDValue(N, 0); } return ReduceLoadOpStoreWidth(N); } SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { SDValue InVec = N->getOperand(0); SDValue InVal = N->getOperand(1); SDValue EltNo = N->getOperand(2); SDLoc dl(N); // If the inserted element is an UNDEF, just use the input vector. if (InVal.getOpcode() == ISD::UNDEF) return InVec; EVT VT = InVec.getValueType(); // If we can't generate a legal BUILD_VECTOR, exit if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) return SDValue(); // Check that we know which element is being inserted if (!isa(EltNo)) return SDValue(); unsigned Elt = cast(EltNo)->getZExtValue(); // Canonicalize insert_vector_elt dag nodes. // Example: // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1) // -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0) // // Do this only if the child insert_vector node has one use; also // do this only if indices are both constants and Idx1 < Idx0. if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse() && isa(InVec.getOperand(2))) { unsigned OtherElt = cast(InVec.getOperand(2))->getZExtValue(); if (Elt < OtherElt) { // Swap nodes. SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), VT, InVec.getOperand(0), InVal, EltNo); AddToWorklist(NewOp.getNode()); return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()), VT, NewOp, InVec.getOperand(1), InVec.getOperand(2)); } } // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially // be converted to a BUILD_VECTOR). Fill in the Ops vector with the // vector elements. SmallVector Ops; // Do not combine these two vectors if the output vector will not replace // the input vector. if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) { Ops.append(InVec.getNode()->op_begin(), InVec.getNode()->op_end()); } else if (InVec.getOpcode() == ISD::UNDEF) { unsigned NElts = VT.getVectorNumElements(); Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); } else { return SDValue(); } // Insert the element if (Elt < Ops.size()) { // All the operands of BUILD_VECTOR must have the same type; // we enforce that here. EVT OpVT = Ops[0].getValueType(); if (InVal.getValueType() != OpVT) InVal = OpVT.bitsGT(InVal.getValueType()) ? DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) : DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal); Ops[Elt] = InVal; } // Return the new vector return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad( SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) { EVT ResultVT = EVE->getValueType(0); EVT VecEltVT = InVecVT.getVectorElementType(); unsigned Align = OriginalLoad->getAlignment(); unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment( VecEltVT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT)) return SDValue(); Align = NewAlign; SDValue NewPtr = OriginalLoad->getBasePtr(); SDValue Offset; EVT PtrType = NewPtr.getValueType(); MachinePointerInfo MPI; if (auto *ConstEltNo = dyn_cast(EltNo)) { int Elt = ConstEltNo->getZExtValue(); unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8; if (TLI.isBigEndian()) PtrOff = InVecVT.getSizeInBits() / 8 - PtrOff; Offset = DAG.getConstant(PtrOff, PtrType); MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff); } else { Offset = DAG.getNode( ISD::MUL, SDLoc(EVE), EltNo.getValueType(), EltNo, DAG.getConstant(VecEltVT.getStoreSize(), EltNo.getValueType())); if (TLI.isBigEndian()) Offset = DAG.getNode( ISD::SUB, SDLoc(EVE), EltNo.getValueType(), DAG.getConstant(InVecVT.getStoreSize(), EltNo.getValueType()), Offset); MPI = OriginalLoad->getPointerInfo(); } NewPtr = DAG.getNode(ISD::ADD, SDLoc(EVE), PtrType, NewPtr, Offset); // The replacement we need to do here is a little tricky: we need to // replace an extractelement of a load with a load. // Use ReplaceAllUsesOfValuesWith to do the replacement. // Note that this replacement assumes that the extractvalue is the only // use of the load; that's okay because we don't want to perform this // transformation in other cases anyway. SDValue Load; SDValue Chain; if (ResultVT.bitsGT(VecEltVT)) { // If the result type of vextract is wider than the load, then issue an // extending load instead. ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, VecEltVT) ? ISD::ZEXTLOAD : ISD::EXTLOAD; Load = DAG.getExtLoad( ExtType, SDLoc(EVE), ResultVT, OriginalLoad->getChain(), NewPtr, MPI, VecEltVT, OriginalLoad->isVolatile(), OriginalLoad->isNonTemporal(), OriginalLoad->isInvariant(), Align, OriginalLoad->getAAInfo()); Chain = Load.getValue(1); } else { Load = DAG.getLoad( VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI, OriginalLoad->isVolatile(), OriginalLoad->isNonTemporal(), OriginalLoad->isInvariant(), Align, OriginalLoad->getAAInfo()); Chain = Load.getValue(1); if (ResultVT.bitsLT(VecEltVT)) Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load); else Load = DAG.getNode(ISD::BITCAST, SDLoc(EVE), ResultVT, Load); } WorklistRemover DeadNodes(*this); SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) }; SDValue To[] = { Load, Chain }; DAG.ReplaceAllUsesOfValuesWith(From, To, 2); // Since we're explicitly calling ReplaceAllUses, add the new node to the // worklist explicitly as well. AddToWorklist(Load.getNode()); AddUsersToWorklist(Load.getNode()); // Add users too // Make sure to revisit this node to clean it up; it will usually be dead. AddToWorklist(EVE); ++OpsNarrowed; return SDValue(EVE, 0); } SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // (vextract (scalar_to_vector val, 0) -> val SDValue InVec = N->getOperand(0); EVT VT = InVec.getValueType(); EVT NVT = N->getValueType(0); if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { // Check if the result type doesn't match the inserted element type. A // SCALAR_TO_VECTOR may truncate the inserted element and the // EXTRACT_VECTOR_ELT may widen the extracted vector. SDValue InOp = InVec.getOperand(0); if (InOp.getValueType() != NVT) { assert(InOp.getValueType().isInteger() && NVT.isInteger()); return DAG.getSExtOrTrunc(InOp, SDLoc(InVec), NVT); } return InOp; } SDValue EltNo = N->getOperand(1); bool ConstEltNo = isa(EltNo); // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT. // We only perform this optimization before the op legalization phase because // we may introduce new vector instructions which are not backed by TD // patterns. For example on AVX, extracting elements from a wide vector // without using extract_subvector. However, if we can find an underlying // scalar value, then we can always use that. if (InVec.getOpcode() == ISD::VECTOR_SHUFFLE && ConstEltNo) { int Elt = cast(EltNo)->getZExtValue(); int NumElem = VT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast(InVec); // Find the new index to extract from. int OrigElt = SVOp->getMaskElt(Elt); // Extracting an undef index is undef. if (OrigElt == -1) return DAG.getUNDEF(NVT); // Select the right vector half to extract from. SDValue SVInVec; if (OrigElt < NumElem) { SVInVec = InVec->getOperand(0); } else { SVInVec = InVec->getOperand(1); OrigElt -= NumElem; } if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) { SDValue InOp = SVInVec.getOperand(OrigElt); if (InOp.getValueType() != NVT) { assert(InOp.getValueType().isInteger() && NVT.isInteger()); InOp = DAG.getSExtOrTrunc(InOp, SDLoc(SVInVec), NVT); } return InOp; } // FIXME: We should handle recursing on other vector shuffles and // scalar_to_vector here as well. if (!LegalOperations) { EVT IndexTy = TLI.getVectorIdxTy(); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NVT, SVInVec, DAG.getConstant(OrigElt, IndexTy)); } } bool BCNumEltsChanged = false; EVT ExtVT = VT.getVectorElementType(); EVT LVT = ExtVT; // If the result of load has to be truncated, then it's not necessarily // profitable. if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT)) return SDValue(); if (InVec.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); EVT BCVT = InVec.getOperand(0).getValueType(); if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType())) return SDValue(); if (VT.getVectorNumElements() != BCVT.getVectorNumElements()) BCNumEltsChanged = true; InVec = InVec.getOperand(0); ExtVT = BCVT.getVectorElementType(); } // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size) if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() && ISD::isNormalLoad(InVec.getNode()) && !N->getOperand(1)->hasPredecessor(InVec.getNode())) { SDValue Index = N->getOperand(1); if (LoadSDNode *OrigLoad = dyn_cast(InVec)) return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index, OrigLoad); } // Perform only after legalization to ensure build_vector / vector_shuffle // optimizations have already been done. if (!LegalOperations) return SDValue(); // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size) // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size) // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr) if (ConstEltNo) { int Elt = cast(EltNo)->getZExtValue(); LoadSDNode *LN0 = nullptr; const ShuffleVectorSDNode *SVN = nullptr; if (ISD::isNormalLoad(InVec.getNode())) { LN0 = cast(InVec); } else if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR && InVec.getOperand(0).getValueType() == ExtVT && ISD::isNormalLoad(InVec.getOperand(0).getNode())) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); LN0 = cast(InVec.getOperand(0)); } else if ((SVN = dyn_cast(InVec))) { // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1) // => // (load $addr+1*size) // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); // If the bit convert changed the number of elements, it is unsafe // to examine the mask. if (BCNumEltsChanged) return SDValue(); // Select the input vector, guarding against out of range extract vector. unsigned NumElems = VT.getVectorNumElements(); int Idx = (Elt > (int)NumElems) ? -1 : SVN->getMaskElt(Elt); InVec = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1); if (InVec.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); InVec = InVec.getOperand(0); } if (ISD::isNormalLoad(InVec.getNode())) { LN0 = cast(InVec); Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems; EltNo = DAG.getConstant(Elt, EltNo.getValueType()); } } // Make sure we found a non-volatile load and the extractelement is // the only use. if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile()) return SDValue(); // If Idx was -1 above, Elt is going to be -1, so just return undef. if (Elt == -1) return DAG.getUNDEF(LVT); return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, EltNo, LN0); } return SDValue(); } // Simplify (build_vec (ext )) to (bitcast (build_vec )) SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) { // We perform this optimization post type-legalization because // the type-legalizer often scalarizes integer-promoted vectors. // Performing this optimization before may create bit-casts which // will be type-legalized to complex code sequences. // We perform this optimization only before the operation legalizer because we // may introduce illegal operations. if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes) return SDValue(); unsigned NumInScalars = N->getNumOperands(); SDLoc dl(N); EVT VT = N->getValueType(0); // Check to see if this is a BUILD_VECTOR of a bunch of values // which come from any_extend or zero_extend nodes. If so, we can create // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR // optimizations. We do not handle sign-extend because we can't fill the sign // using shuffles. EVT SourceType = MVT::Other; bool AllAnyExt = true; for (unsigned i = 0; i != NumInScalars; ++i) { SDValue In = N->getOperand(i); // Ignore undef inputs. if (In.getOpcode() == ISD::UNDEF) continue; bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND; bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND; // Abort if the element is not an extension. if (!ZeroExt && !AnyExt) { SourceType = MVT::Other; break; } // The input is a ZeroExt or AnyExt. Check the original type. EVT InTy = In.getOperand(0).getValueType(); // Check that all of the widened source types are the same. if (SourceType == MVT::Other) // First time. SourceType = InTy; else if (InTy != SourceType) { // Multiple income types. Abort. SourceType = MVT::Other; break; } // Check if all of the extends are ANY_EXTENDs. AllAnyExt &= AnyExt; } // In order to have valid types, all of the inputs must be extended from the // same source type and all of the inputs must be any or zero extend. // Scalar sizes must be a power of two. EVT OutScalarTy = VT.getScalarType(); bool ValidTypes = SourceType != MVT::Other && isPowerOf2_32(OutScalarTy.getSizeInBits()) && isPowerOf2_32(SourceType.getSizeInBits()); // Create a new simpler BUILD_VECTOR sequence which other optimizations can // turn into a single shuffle instruction. if (!ValidTypes) return SDValue(); bool isLE = TLI.isLittleEndian(); unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits(); assert(ElemRatio > 1 && "Invalid element size ratio"); SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType): DAG.getConstant(0, SourceType); unsigned NewBVElems = ElemRatio * VT.getVectorNumElements(); SmallVector Ops(NewBVElems, Filler); // Populate the new build_vector for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { SDValue Cast = N->getOperand(i); assert((Cast.getOpcode() == ISD::ANY_EXTEND || Cast.getOpcode() == ISD::ZERO_EXTEND || Cast.getOpcode() == ISD::UNDEF) && "Invalid cast opcode"); SDValue In; if (Cast.getOpcode() == ISD::UNDEF) In = DAG.getUNDEF(SourceType); else In = Cast->getOperand(0); unsigned Index = isLE ? (i * ElemRatio) : (i * ElemRatio + (ElemRatio - 1)); assert(Index < Ops.size() && "Invalid index"); Ops[Index] = In; } // The type of the new BUILD_VECTOR node. EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems); assert(VecVT.getSizeInBits() == VT.getSizeInBits() && "Invalid vector size"); // Check if the new vector type is legal. if (!isTypeLegal(VecVT)) return SDValue(); // Make the new BUILD_VECTOR. SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); // The new BUILD_VECTOR node has the potential to be further optimized. AddToWorklist(BV.getNode()); // Bitcast to the desired type. return DAG.getNode(ISD::BITCAST, dl, VT, BV); } SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumInScalars = N->getNumOperands(); SDLoc dl(N); EVT SrcVT = MVT::Other; unsigned Opcode = ISD::DELETED_NODE; unsigned NumDefs = 0; for (unsigned i = 0; i != NumInScalars; ++i) { SDValue In = N->getOperand(i); unsigned Opc = In.getOpcode(); if (Opc == ISD::UNDEF) continue; // If all scalar values are floats and converted from integers. if (Opcode == ISD::DELETED_NODE && (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) { Opcode = Opc; } if (Opc != Opcode) return SDValue(); EVT InVT = In.getOperand(0).getValueType(); // If all scalar values are typed differently, bail out. It's chosen to // simplify BUILD_VECTOR of integer types. if (SrcVT == MVT::Other) SrcVT = InVT; if (SrcVT != InVT) return SDValue(); NumDefs++; } // If the vector has just one element defined, it's not worth to fold it into // a vectorized one. if (NumDefs < 2) return SDValue(); assert((Opcode == ISD::UINT_TO_FP || Opcode == ISD::SINT_TO_FP) && "Should only handle conversion from integer to float."); assert(SrcVT != MVT::Other && "Cannot determine source type!"); EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars); if (!TLI.isOperationLegalOrCustom(Opcode, NVT)) return SDValue(); SmallVector Opnds; for (unsigned i = 0; i != NumInScalars; ++i) { SDValue In = N->getOperand(i); if (In.getOpcode() == ISD::UNDEF) Opnds.push_back(DAG.getUNDEF(SrcVT)); else Opnds.push_back(In.getOperand(0)); } SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Opnds); AddToWorklist(BV.getNode()); return DAG.getNode(Opcode, dl, VT, BV); } SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { unsigned NumInScalars = N->getNumOperands(); SDLoc dl(N); EVT VT = N->getValueType(0); // A vector built entirely of undefs is undef. if (ISD::allOperandsUndef(N)) return DAG.getUNDEF(VT); SDValue V = reduceBuildVecExtToExtBuildVec(N); if (V.getNode()) return V; V = reduceBuildVecConvertToConvertBuildVec(N); if (V.getNode()) return V; // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT // operations. If so, and if the EXTRACT_VECTOR_ELT vector inputs come from // at most two distinct vectors, turn this into a shuffle node. // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes. if (!isTypeLegal(VT)) return SDValue(); // May only combine to shuffle after legalize if shuffle is legal. if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT)) return SDValue(); SDValue VecIn1, VecIn2; bool UsesZeroVector = false; for (unsigned i = 0; i != NumInScalars; ++i) { SDValue Op = N->getOperand(i); // Ignore undef inputs. if (Op.getOpcode() == ISD::UNDEF) continue; // See if we can combine this build_vector into a blend with a zero vector. if (!VecIn2.getNode() && ((Op.getOpcode() == ISD::Constant && cast(Op.getNode())->isNullValue()) || (Op.getOpcode() == ISD::ConstantFP && cast(Op.getNode())->getValueAPF().isZero()))) { UsesZeroVector = true; continue; } // If this input is something other than a EXTRACT_VECTOR_ELT with a // constant index, bail out. if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Op.getOperand(1))) { VecIn1 = VecIn2 = SDValue(nullptr, 0); break; } // We allow up to two distinct input vectors. SDValue ExtractedFromVec = Op.getOperand(0); if (ExtractedFromVec == VecIn1 || ExtractedFromVec == VecIn2) continue; if (!VecIn1.getNode()) { VecIn1 = ExtractedFromVec; } else if (!VecIn2.getNode() && !UsesZeroVector) { VecIn2 = ExtractedFromVec; } else { // Too many inputs. VecIn1 = VecIn2 = SDValue(nullptr, 0); break; } } // If everything is good, we can make a shuffle operation. if (VecIn1.getNode()) { unsigned InNumElements = VecIn1.getValueType().getVectorNumElements(); SmallVector Mask; for (unsigned i = 0; i != NumInScalars; ++i) { unsigned Opcode = N->getOperand(i).getOpcode(); if (Opcode == ISD::UNDEF) { Mask.push_back(-1); continue; } // Operands can also be zero. if (Opcode != ISD::EXTRACT_VECTOR_ELT) { assert(UsesZeroVector && (Opcode == ISD::Constant || Opcode == ISD::ConstantFP) && "Unexpected node found!"); Mask.push_back(NumInScalars+i); continue; } // If extracting from the first vector, just use the index directly. SDValue Extract = N->getOperand(i); SDValue ExtVal = Extract.getOperand(1); unsigned ExtIndex = cast(ExtVal)->getZExtValue(); if (Extract.getOperand(0) == VecIn1) { Mask.push_back(ExtIndex); continue; } // Otherwise, use InIdx + InputVecSize Mask.push_back(InNumElements + ExtIndex); } // Avoid introducing illegal shuffles with zero. if (UsesZeroVector && !TLI.isVectorClearMaskLegal(Mask, VT)) return SDValue(); // We can't generate a shuffle node with mismatched input and output types. // Attempt to transform a single input vector to the correct type. if ((VT != VecIn1.getValueType())) { // If the input vector type has a different base type to the output // vector type, bail out. EVT VTElemType = VT.getVectorElementType(); if ((VecIn1.getValueType().getVectorElementType() != VTElemType) || (VecIn2.getNode() && (VecIn2.getValueType().getVectorElementType() != VTElemType))) return SDValue(); // If the input vector is too small, widen it. // We only support widening of vectors which are half the size of the // output registers. For example XMM->YMM widening on X86 with AVX. EVT VecInT = VecIn1.getValueType(); if (VecInT.getSizeInBits() * 2 == VT.getSizeInBits()) { // If we only have one small input, widen it by adding undef values. if (!VecIn2.getNode()) VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, VecIn1, DAG.getUNDEF(VecIn1.getValueType())); else if (VecIn1.getValueType() == VecIn2.getValueType()) { // If we have two small inputs of the same type, try to concat them. VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, VecIn1, VecIn2); VecIn2 = SDValue(nullptr, 0); } else return SDValue(); } else if (VecInT.getSizeInBits() == VT.getSizeInBits() * 2) { // If the input vector is too large, try to split it. // We don't support having two input vectors that are too large. if (VecIn2.getNode()) return SDValue(); if (!TLI.isExtractSubvectorCheap(VT, VT.getVectorNumElements())) return SDValue(); // Try to replace VecIn1 with two extract_subvectors // No need to update the masks, they should still be correct. VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1, DAG.getConstant(VT.getVectorNumElements(), TLI.getVectorIdxTy())); VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1, DAG.getConstant(0, TLI.getVectorIdxTy())); UsesZeroVector = false; } else return SDValue(); } if (UsesZeroVector) VecIn2 = VT.isInteger() ? DAG.getConstant(0, VT) : DAG.getConstantFP(0.0, VT); else // If VecIn2 is unused then change it to undef. VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); // Check that we were able to transform all incoming values to the same // type. if (VecIn2.getValueType() != VecIn1.getValueType() || VecIn1.getValueType() != VT) return SDValue(); // Return the new VECTOR_SHUFFLE node. SDValue Ops[2]; Ops[0] = VecIn1; Ops[1] = VecIn2; return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], &Mask[0]); } return SDValue(); } SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { // TODO: Check to see if this is a CONCAT_VECTORS of a bunch of // EXTRACT_SUBVECTOR operations. If so, and if the EXTRACT_SUBVECTOR vector // inputs come from at most two distinct vectors, turn this into a shuffle // node. // If we only have one input vector, we don't need to do any concatenation. if (N->getNumOperands() == 1) return N->getOperand(0); // Check if all of the operands are undefs. EVT VT = N->getValueType(0); if (ISD::allOperandsUndef(N)) return DAG.getUNDEF(VT); // Optimize concat_vectors where one of the vectors is undef. if (N->getNumOperands() == 2 && N->getOperand(1)->getOpcode() == ISD::UNDEF) { SDValue In = N->getOperand(0); assert(In.getValueType().isVector() && "Must concat vectors"); // Transform: concat_vectors(scalar, undef) -> scalar_to_vector(sclr). if (In->getOpcode() == ISD::BITCAST && !In->getOperand(0)->getValueType(0).isVector()) { SDValue Scalar = In->getOperand(0); EVT SclTy = Scalar->getValueType(0); if (!SclTy.isFloatingPoint() && !SclTy.isInteger()) return SDValue(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VT.getSizeInBits() / SclTy.getSizeInBits()); if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType())) return SDValue(); SDLoc dl = SDLoc(N); SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NVT, Scalar); return DAG.getNode(ISD::BITCAST, dl, VT, Res); } } // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...)) // -> (BUILD_VECTOR A, B, ..., C, D, ...) if (N->getNumOperands() == 2 && N->getOperand(0).getOpcode() == ISD::BUILD_VECTOR && N->getOperand(1).getOpcode() == ISD::BUILD_VECTOR) { EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SmallVector Opnds; unsigned BuildVecNumElts = N0.getNumOperands(); EVT SclTy0 = N0.getOperand(0)->getValueType(0); EVT SclTy1 = N1.getOperand(0)->getValueType(0); if (SclTy0.isFloatingPoint()) { for (unsigned i = 0; i != BuildVecNumElts; ++i) Opnds.push_back(N0.getOperand(i)); for (unsigned i = 0; i != BuildVecNumElts; ++i) Opnds.push_back(N1.getOperand(i)); } else { // If BUILD_VECTOR are from built from integer, they may have different // operand types. Get the smaller type and truncate all operands to it. EVT MinTy = SclTy0.bitsLE(SclTy1) ? SclTy0 : SclTy1; for (unsigned i = 0; i != BuildVecNumElts; ++i) Opnds.push_back(DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinTy, N0.getOperand(i))); for (unsigned i = 0; i != BuildVecNumElts; ++i) Opnds.push_back(DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinTy, N1.getOperand(i))); } return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Opnds); } // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR // nodes often generate nop CONCAT_VECTOR nodes. // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that // place the incoming vectors at the exact same location. SDValue SingleSource = SDValue(); unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements(); for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { SDValue Op = N->getOperand(i); if (Op.getOpcode() == ISD::UNDEF) continue; // Check if this is the identity extract: if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR) return SDValue(); // Find the single incoming vector for the extract_subvector. if (SingleSource.getNode()) { if (Op.getOperand(0) != SingleSource) return SDValue(); } else { SingleSource = Op.getOperand(0); // Check the source type is the same as the type of the result. // If not, this concat may extend the vector, so we can not // optimize it away. if (SingleSource.getValueType() != N->getValueType(0)) return SDValue(); } unsigned IdentityIndex = i * PartNumElem; ConstantSDNode *CS = dyn_cast(Op.getOperand(1)); // The extract index must be constant. if (!CS) return SDValue(); // Check that we are reading from the identity index. if (CS->getZExtValue() != IdentityIndex) return SDValue(); } if (SingleSource.getNode()) return SingleSource; return SDValue(); } SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { EVT NVT = N->getValueType(0); SDValue V = N->getOperand(0); if (V->getOpcode() == ISD::CONCAT_VECTORS) { // Combine: // (extract_subvec (concat V1, V2, ...), i) // Into: // Vi if possible // Only operand 0 is checked as 'concat' assumes all inputs of the same // type. if (V->getOperand(0).getValueType() != NVT) return SDValue(); unsigned Idx = dyn_cast(N->getOperand(1))->getZExtValue(); unsigned NumElems = NVT.getVectorNumElements(); assert((Idx % NumElems) == 0 && "IDX in concat is not a multiple of the result vector length."); return V->getOperand(Idx / NumElems); } // Skip bitcasting if (V->getOpcode() == ISD::BITCAST) V = V.getOperand(0); if (V->getOpcode() == ISD::INSERT_SUBVECTOR) { SDLoc dl(N); // Handle only simple case where vector being inserted and vector // being extracted are of same type, and are half size of larger vectors. EVT BigVT = V->getOperand(0).getValueType(); EVT SmallVT = V->getOperand(1).getValueType(); if (!NVT.bitsEq(SmallVT) || NVT.getSizeInBits()*2 != BigVT.getSizeInBits()) return SDValue(); // Only handle cases where both indexes are constants with the same type. ConstantSDNode *ExtIdx = dyn_cast(N->getOperand(1)); ConstantSDNode *InsIdx = dyn_cast(V->getOperand(2)); if (InsIdx && ExtIdx && InsIdx->getValueType(0).getSizeInBits() <= 64 && ExtIdx->getValueType(0).getSizeInBits() <= 64) { // Combine: // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx) // Into: // indices are equal or bit offsets are equal => V1 // otherwise => (extract_subvec V1, ExtIdx) if (InsIdx->getZExtValue() * SmallVT.getScalarType().getSizeInBits() == ExtIdx->getZExtValue() * NVT.getScalarType().getSizeInBits()) return DAG.getNode(ISD::BITCAST, dl, NVT, V->getOperand(1)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, DAG.getNode(ISD::BITCAST, dl, N->getOperand(0).getValueType(), V->getOperand(0)), N->getOperand(1)); } } return SDValue(); } static SDValue simplifyShuffleOperandRecursively(SmallBitVector &UsedElements, SDValue V, SelectionDAG &DAG) { SDLoc DL(V); EVT VT = V.getValueType(); switch (V.getOpcode()) { default: return V; case ISD::CONCAT_VECTORS: { EVT OpVT = V->getOperand(0).getValueType(); int OpSize = OpVT.getVectorNumElements(); SmallBitVector OpUsedElements(OpSize, false); bool FoundSimplification = false; SmallVector NewOps; NewOps.reserve(V->getNumOperands()); for (int i = 0, NumOps = V->getNumOperands(); i < NumOps; ++i) { SDValue Op = V->getOperand(i); bool OpUsed = false; for (int j = 0; j < OpSize; ++j) if (UsedElements[i * OpSize + j]) { OpUsedElements[j] = true; OpUsed = true; } NewOps.push_back( OpUsed ? simplifyShuffleOperandRecursively(OpUsedElements, Op, DAG) : DAG.getUNDEF(OpVT)); FoundSimplification |= Op == NewOps.back(); OpUsedElements.reset(); } if (FoundSimplification) V = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, NewOps); return V; } case ISD::INSERT_SUBVECTOR: { SDValue BaseV = V->getOperand(0); SDValue SubV = V->getOperand(1); auto *IdxN = dyn_cast(V->getOperand(2)); if (!IdxN) return V; int SubSize = SubV.getValueType().getVectorNumElements(); int Idx = IdxN->getZExtValue(); bool SubVectorUsed = false; SmallBitVector SubUsedElements(SubSize, false); for (int i = 0; i < SubSize; ++i) if (UsedElements[i + Idx]) { SubVectorUsed = true; SubUsedElements[i] = true; UsedElements[i + Idx] = false; } // Now recurse on both the base and sub vectors. SDValue SimplifiedSubV = SubVectorUsed ? simplifyShuffleOperandRecursively(SubUsedElements, SubV, DAG) : DAG.getUNDEF(SubV.getValueType()); SDValue SimplifiedBaseV = simplifyShuffleOperandRecursively(UsedElements, BaseV, DAG); if (SimplifiedSubV != SubV || SimplifiedBaseV != BaseV) V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, SimplifiedBaseV, SimplifiedSubV, V->getOperand(2)); return V; } } } static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0, SDValue N1, SelectionDAG &DAG) { EVT VT = SVN->getValueType(0); int NumElts = VT.getVectorNumElements(); SmallBitVector N0UsedElements(NumElts, false), N1UsedElements(NumElts, false); for (int M : SVN->getMask()) if (M >= 0 && M < NumElts) N0UsedElements[M] = true; else if (M >= NumElts) N1UsedElements[M - NumElts] = true; SDValue S0 = simplifyShuffleOperandRecursively(N0UsedElements, N0, DAG); SDValue S1 = simplifyShuffleOperandRecursively(N1UsedElements, N1, DAG); if (S0 == N0 && S1 == N1) return SDValue(); return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask()); } // Tries to turn a shuffle of two CONCAT_VECTORS into a single concat. static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ShuffleVectorSDNode *SVN = cast(N); SmallVector Ops; EVT ConcatVT = N0.getOperand(0).getValueType(); unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements(); unsigned NumConcats = NumElts / NumElemsPerConcat; // Look at every vector that's inserted. We're looking for exact // subvector-sized copies from a concatenated vector for (unsigned I = 0; I != NumConcats; ++I) { // Make sure we're dealing with a copy. unsigned Begin = I * NumElemsPerConcat; bool AllUndef = true, NoUndef = true; for (unsigned J = Begin; J != Begin + NumElemsPerConcat; ++J) { if (SVN->getMaskElt(J) >= 0) AllUndef = false; else NoUndef = false; } if (NoUndef) { if (SVN->getMaskElt(Begin) % NumElemsPerConcat != 0) return SDValue(); for (unsigned J = 1; J != NumElemsPerConcat; ++J) if (SVN->getMaskElt(Begin + J - 1) + 1 != SVN->getMaskElt(Begin + J)) return SDValue(); unsigned FirstElt = SVN->getMaskElt(Begin) / NumElemsPerConcat; if (FirstElt < N0.getNumOperands()) Ops.push_back(N0.getOperand(FirstElt)); else Ops.push_back(N1.getOperand(FirstElt - N0.getNumOperands())); } else if (AllUndef) { Ops.push_back(DAG.getUNDEF(N0.getOperand(0).getValueType())); } else { // Mixed with general masks and undefs, can't do optimization. return SDValue(); } } return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); } SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG"); // Canonicalize shuffle undef, undef -> undef if (N0.getOpcode() == ISD::UNDEF && N1.getOpcode() == ISD::UNDEF) return DAG.getUNDEF(VT); ShuffleVectorSDNode *SVN = cast(N); // Canonicalize shuffle v, v -> v, undef if (N0 == N1) { SmallVector NewMask; for (unsigned i = 0; i != NumElts; ++i) { int Idx = SVN->getMaskElt(i); if (Idx >= (int)NumElts) Idx -= NumElts; NewMask.push_back(Idx); } return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT), &NewMask[0]); } // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask. if (N0.getOpcode() == ISD::UNDEF) { SmallVector NewMask; for (unsigned i = 0; i != NumElts; ++i) { int Idx = SVN->getMaskElt(i); if (Idx >= 0) { if (Idx >= (int)NumElts) Idx -= NumElts; else Idx = -1; // remove reference to lhs } NewMask.push_back(Idx); } return DAG.getVectorShuffle(VT, SDLoc(N), N1, DAG.getUNDEF(VT), &NewMask[0]); } // Remove references to rhs if it is undef if (N1.getOpcode() == ISD::UNDEF) { bool Changed = false; SmallVector NewMask; for (unsigned i = 0; i != NumElts; ++i) { int Idx = SVN->getMaskElt(i); if (Idx >= (int)NumElts) { Idx = -1; Changed = true; } NewMask.push_back(Idx); } if (Changed) return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, &NewMask[0]); } // If it is a splat, check if the argument vector is another splat or a // build_vector with all scalar elements the same. if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { SDNode *V = N0.getNode(); // If this is a bit convert that changes the element type of the vector but // not the number of vector elements, look through it. Be careful not to // look though conversions that change things like v4f32 to v2f64. if (V->getOpcode() == ISD::BITCAST) { SDValue ConvInput = V->getOperand(0); if (ConvInput.getValueType().isVector() && ConvInput.getValueType().getVectorNumElements() == NumElts) V = ConvInput.getNode(); } if (V->getOpcode() == ISD::BUILD_VECTOR) { assert(V->getNumOperands() == NumElts && "BUILD_VECTOR has wrong number of operands"); SDValue Base; bool AllSame = true; for (unsigned i = 0; i != NumElts; ++i) { if (V->getOperand(i).getOpcode() != ISD::UNDEF) { Base = V->getOperand(i); break; } } // Splat of , return if (!Base.getNode()) return N0; for (unsigned i = 0; i != NumElts; ++i) { if (V->getOperand(i) != Base) { AllSame = false; break; } } // Splat of , return if (AllSame) return N0; } } // There are various patterns used to build up a vector from smaller vectors, // subvectors, or elements. Scan chains of these and replace unused insertions // or components with undef. if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG)) return S; if (N0.getOpcode() == ISD::CONCAT_VECTORS && Level < AfterLegalizeVectorOps && (N1.getOpcode() == ISD::UNDEF || (N1.getOpcode() == ISD::CONCAT_VECTORS && N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) { SDValue V = partitionShuffleOfConcats(N, DAG); if (V.getNode()) return V; } // Canonicalize shuffles according to rules: // shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A) // shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B) // shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) if (N1.getOpcode() == ISD::VECTOR_SHUFFLE && N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) { // The incoming shuffle must be of the same type as the result of the // current shuffle. assert(N1->getOperand(0).getValueType() == VT && "Shuffle types don't match"); SDValue SV0 = N1->getOperand(0); SDValue SV1 = N1->getOperand(1); bool HasSameOp0 = N0 == SV0; bool IsSV1Undef = SV1.getOpcode() == ISD::UNDEF; if (HasSameOp0 || IsSV1Undef || N0 == SV1) // Commute the operands of this shuffle so that next rule // will trigger. return DAG.getCommutedVectorShuffle(*SVN); } // Try to fold according to rules: // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) // Don't try to fold shuffles with illegal type. if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) { ShuffleVectorSDNode *OtherSV = cast(N0); // The incoming shuffle must be of the same type as the result of the // current shuffle. assert(OtherSV->getOperand(0).getValueType() == VT && "Shuffle types don't match"); SDValue SV0, SV1; SmallVector Mask; // Compute the combined shuffle mask for a shuffle with SV0 as the first // operand, and SV1 as the second operand. for (unsigned i = 0; i != NumElts; ++i) { int Idx = SVN->getMaskElt(i); if (Idx < 0) { // Propagate Undef. Mask.push_back(Idx); continue; } SDValue CurrentVec; if (Idx < (int)NumElts) { // This shuffle index refers to the inner shuffle N0. Lookup the inner // shuffle mask to identify which vector is actually referenced. Idx = OtherSV->getMaskElt(Idx); if (Idx < 0) { // Propagate Undef. Mask.push_back(Idx); continue; } CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0) : OtherSV->getOperand(1); } else { // This shuffle index references an element within N1. CurrentVec = N1; } // Simple case where 'CurrentVec' is UNDEF. if (CurrentVec.getOpcode() == ISD::UNDEF) { Mask.push_back(-1); continue; } // Canonicalize the shuffle index. We don't know yet if CurrentVec // will be the first or second operand of the combined shuffle. Idx = Idx % NumElts; if (!SV0.getNode() || SV0 == CurrentVec) { // Ok. CurrentVec is the left hand side. // Update the mask accordingly. SV0 = CurrentVec; Mask.push_back(Idx); continue; } // Bail out if we cannot convert the shuffle pair into a single shuffle. if (SV1.getNode() && SV1 != CurrentVec) return SDValue(); // Ok. CurrentVec is the right hand side. // Update the mask accordingly. SV1 = CurrentVec; Mask.push_back(Idx + NumElts); } // Check if all indices in Mask are Undef. In case, propagate Undef. bool isUndefMask = true; for (unsigned i = 0; i != NumElts && isUndefMask; ++i) isUndefMask &= Mask[i] < 0; if (isUndefMask) return DAG.getUNDEF(VT); if (!SV0.getNode()) SV0 = DAG.getUNDEF(VT); if (!SV1.getNode()) SV1 = DAG.getUNDEF(VT); // Avoid introducing shuffles with illegal mask. if (!TLI.isShuffleMaskLegal(Mask, VT)) { // Compute the commuted shuffle mask and test again. for (unsigned i = 0; i != NumElts; ++i) { int idx = Mask[i]; if (idx < 0) continue; else if (idx < (int)NumElts) Mask[i] = idx + NumElts; else Mask[i] = idx - NumElts; } if (!TLI.isShuffleMaskLegal(Mask, VT)) return SDValue(); // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2) std::swap(SV0, SV1); } // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, &Mask[0]); } return SDValue(); } SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N2 = N->getOperand(2); // If the input vector is a concatenation, and the insert replaces // one of the halves, we can optimize into a single concat_vectors. if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0->getNumOperands() == 2 && N2.getOpcode() == ISD::Constant) { APInt InsIdx = cast(N2)->getAPIntValue(); EVT VT = N->getValueType(0); // Lower half: fold (insert_subvector (concat_vectors X, Y), Z) -> // (concat_vectors Z, Y) if (InsIdx == 0) return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N->getOperand(1), N0.getOperand(1)); // Upper half: fold (insert_subvector (concat_vectors X, Y), Z) -> // (concat_vectors X, Z) if (InsIdx == VT.getVectorNumElements()/2) return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0.getOperand(0), N->getOperand(1)); } return SDValue(); } /// Returns a vector_shuffle if it able to transform an AND to a vector_shuffle /// with the destination vector and a zero vector. /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==> /// vector_shuffle V, Zero, <0, 4, 2, 4> SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { EVT VT = N->getValueType(0); SDLoc dl(N); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); if (N->getOpcode() == ISD::AND) { if (RHS.getOpcode() == ISD::BITCAST) RHS = RHS.getOperand(0); if (RHS.getOpcode() == ISD::BUILD_VECTOR) { SmallVector Indices; unsigned NumElts = RHS.getNumOperands(); for (unsigned i = 0; i != NumElts; ++i) { SDValue Elt = RHS.getOperand(i); if (!isa(Elt)) return SDValue(); if (cast(Elt)->isAllOnesValue()) Indices.push_back(i); else if (cast(Elt)->isNullValue()) Indices.push_back(NumElts+i); else return SDValue(); } // Let's see if the target supports this vector_shuffle. EVT RVT = RHS.getValueType(); if (!TLI.isVectorClearMaskLegal(Indices, RVT)) return SDValue(); // Return the new VECTOR_SHUFFLE node. EVT EltVT = RVT.getVectorElementType(); SmallVector ZeroOps(RVT.getVectorNumElements(), DAG.getConstant(0, EltVT)); SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), RVT, ZeroOps); LHS = DAG.getNode(ISD::BITCAST, dl, RVT, LHS); SDValue Shuf = DAG.getVectorShuffle(RVT, dl, LHS, Zero, &Indices[0]); return DAG.getNode(ISD::BITCAST, dl, VT, Shuf); } } return SDValue(); } /// Visit a binary vector operation, like ADD. SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { assert(N->getValueType(0).isVector() && "SimplifyVBinOp only works on vectors!"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue Shuffle = XformToShuffleWithZero(N); if (Shuffle.getNode()) return Shuffle; // If the LHS and RHS are BUILD_VECTOR nodes, see if we can constant fold // this operation. if (LHS.getOpcode() == ISD::BUILD_VECTOR && RHS.getOpcode() == ISD::BUILD_VECTOR) { // Check if both vectors are constants. If not bail out. if (!(cast(LHS)->isConstant() && cast(RHS)->isConstant())) return SDValue(); SmallVector Ops; for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) { SDValue LHSOp = LHS.getOperand(i); SDValue RHSOp = RHS.getOperand(i); // Can't fold divide by zero. if (N->getOpcode() == ISD::SDIV || N->getOpcode() == ISD::UDIV || N->getOpcode() == ISD::FDIV) { if ((RHSOp.getOpcode() == ISD::Constant && cast(RHSOp.getNode())->isNullValue()) || (RHSOp.getOpcode() == ISD::ConstantFP && cast(RHSOp.getNode())->getValueAPF().isZero())) break; } EVT VT = LHSOp.getValueType(); EVT RVT = RHSOp.getValueType(); if (RVT != VT) { // Integer BUILD_VECTOR operands may have types larger than the element // size (e.g., when the element type is not legal). Prior to type // legalization, the types may not match between the two BUILD_VECTORS. // Truncate one of the operands to make them match. if (RVT.getSizeInBits() > VT.getSizeInBits()) { RHSOp = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, RHSOp); } else { LHSOp = DAG.getNode(ISD::TRUNCATE, SDLoc(N), RVT, LHSOp); VT = RVT; } } SDValue FoldOp = DAG.getNode(N->getOpcode(), SDLoc(LHS), VT, LHSOp, RHSOp); if (FoldOp.getOpcode() != ISD::UNDEF && FoldOp.getOpcode() != ISD::Constant && FoldOp.getOpcode() != ISD::ConstantFP) break; Ops.push_back(FoldOp); AddToWorklist(FoldOp.getNode()); } if (Ops.size() == LHS.getNumOperands()) return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), LHS.getValueType(), Ops); } // Type legalization might introduce new shuffles in the DAG. // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask))) // -> (shuffle (VBinOp (A, B)), Undef, Mask). if (LegalTypes && isa(LHS) && isa(RHS) && LHS.hasOneUse() && RHS.hasOneUse() && LHS.getOperand(1).getOpcode() == ISD::UNDEF && RHS.getOperand(1).getOpcode() == ISD::UNDEF) { ShuffleVectorSDNode *SVN0 = cast(LHS); ShuffleVectorSDNode *SVN1 = cast(RHS); if (SVN0->getMask().equals(SVN1->getMask())) { EVT VT = N->getValueType(0); SDValue UndefVector = LHS.getOperand(1); SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS.getOperand(0), RHS.getOperand(0)); AddUsersToWorklist(N); return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector, &SVN0->getMask()[0]); } } return SDValue(); } /// Visit a binary vector operation, like FABS/FNEG. SDValue DAGCombiner::SimplifyVUnaryOp(SDNode *N) { assert(N->getValueType(0).isVector() && "SimplifyVUnaryOp only works on vectors!"); SDValue N0 = N->getOperand(0); if (N0.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); // Operand is a BUILD_VECTOR node, see if we can constant fold it. SmallVector Ops; for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) { SDValue Op = N0.getOperand(i); if (Op.getOpcode() != ISD::UNDEF && Op.getOpcode() != ISD::ConstantFP) break; EVT EltVT = Op.getValueType(); SDValue FoldOp = DAG.getNode(N->getOpcode(), SDLoc(N0), EltVT, Op); if (FoldOp.getOpcode() != ISD::UNDEF && FoldOp.getOpcode() != ISD::ConstantFP) break; Ops.push_back(FoldOp); AddToWorklist(FoldOp.getNode()); } if (Ops.size() != N0.getNumOperands()) return SDValue(); return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N0.getValueType(), Ops); } SDValue DAGCombiner::SimplifySelect(SDLoc DL, SDValue N0, SDValue N1, SDValue N2){ assert(N0.getOpcode() ==ISD::SETCC && "First argument must be a SetCC node!"); SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2, cast(N0.getOperand(2))->get()); // If we got a simplified select_cc node back from SimplifySelectCC, then // break it down into a new SETCC node, and a new SELECT node, and then return // the SELECT node, since we were called with a SELECT node. if (SCC.getNode()) { // Check to see if we got a select_cc back (to turn into setcc/select). // Otherwise, just return whatever node we got back, like fabs. if (SCC.getOpcode() == ISD::SELECT_CC) { SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0), N0.getValueType(), SCC.getOperand(0), SCC.getOperand(1), SCC.getOperand(4)); AddToWorklist(SETCC.getNode()); return DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC, SCC.getOperand(2), SCC.getOperand(3)); } return SCC; } return SDValue(); } /// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values /// being selected between, see if we can simplify the select. Callers of this /// should assume that TheSelect is deleted if this returns true. As such, they /// should return the appropriate thing (e.g. the node) back to the top-level of /// the DAG combiner loop to avoid it being looked at. bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, SDValue RHS) { // Cannot simplify select with vector condition if (TheSelect->getOperand(0).getValueType().isVector()) return false; // If this is a select from two identical things, try to pull the operation // through the select. if (LHS.getOpcode() != RHS.getOpcode() || !LHS.hasOneUse() || !RHS.hasOneUse()) return false; // If this is a load and the token chain is identical, replace the select // of two loads with a load through a select of the address to load from. // This triggers in things like "select bool X, 10.0, 123.0" after the FP // constants have been dropped into the constant pool. if (LHS.getOpcode() == ISD::LOAD) { LoadSDNode *LLD = cast(LHS); LoadSDNode *RLD = cast(RHS); // Token chains must be identical. if (LHS.getOperand(0) != RHS.getOperand(0) || // Do not let this transformation reduce the number of volatile loads. LLD->isVolatile() || RLD->isVolatile() || // If this is an EXTLOAD, the VT's must match. LLD->getMemoryVT() != RLD->getMemoryVT() || // If this is an EXTLOAD, the kind of extension must match. (LLD->getExtensionType() != RLD->getExtensionType() && // The only exception is if one of the extensions is anyext. LLD->getExtensionType() != ISD::EXTLOAD && RLD->getExtensionType() != ISD::EXTLOAD) || // FIXME: this discards src value information. This is // over-conservative. It would be beneficial to be able to remember // both potential memory locations. Since we are discarding // src value info, don't do the transformation if the memory // locations are not in the default address space. LLD->getPointerInfo().getAddrSpace() != 0 || RLD->getPointerInfo().getAddrSpace() != 0 || !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(), LLD->getBasePtr().getValueType())) return false; // Check that the select condition doesn't reach either load. If so, // folding this will induce a cycle into the DAG. If not, this is safe to // xform, so create a select of the addresses. SDValue Addr; if (TheSelect->getOpcode() == ISD::SELECT) { SDNode *CondNode = TheSelect->getOperand(0).getNode(); if ((LLD->hasAnyUseOfValue(1) && LLD->isPredecessorOf(CondNode)) || (RLD->hasAnyUseOfValue(1) && RLD->isPredecessorOf(CondNode))) return false; // The loads must not depend on one another. if (LLD->isPredecessorOf(RLD) || RLD->isPredecessorOf(LLD)) return false; Addr = DAG.getSelect(SDLoc(TheSelect), LLD->getBasePtr().getValueType(), TheSelect->getOperand(0), LLD->getBasePtr(), RLD->getBasePtr()); } else { // Otherwise SELECT_CC SDNode *CondLHS = TheSelect->getOperand(0).getNode(); SDNode *CondRHS = TheSelect->getOperand(1).getNode(); if ((LLD->hasAnyUseOfValue(1) && (LLD->isPredecessorOf(CondLHS) || LLD->isPredecessorOf(CondRHS))) || (RLD->hasAnyUseOfValue(1) && (RLD->isPredecessorOf(CondLHS) || RLD->isPredecessorOf(CondRHS)))) return false; Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect), LLD->getBasePtr().getValueType(), TheSelect->getOperand(0), TheSelect->getOperand(1), LLD->getBasePtr(), RLD->getBasePtr(), TheSelect->getOperand(4)); } SDValue Load; // It is safe to replace the two loads if they have different alignments, // but the new load must be the minimum (most restrictive) alignment of the // inputs. bool isInvariant = LLD->isInvariant() & RLD->isInvariant(); unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment()); if (LLD->getExtensionType() == ISD::NON_EXTLOAD) { Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect), // FIXME: Discards pointer and AA info. LLD->getChain(), Addr, MachinePointerInfo(), LLD->isVolatile(), LLD->isNonTemporal(), isInvariant, Alignment); } else { Load = DAG.getExtLoad(LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType() : LLD->getExtensionType(), SDLoc(TheSelect), TheSelect->getValueType(0), // FIXME: Discards pointer and AA info. LLD->getChain(), Addr, MachinePointerInfo(), LLD->getMemoryVT(), LLD->isVolatile(), LLD->isNonTemporal(), isInvariant, Alignment); } // Users of the select now use the result of the load. CombineTo(TheSelect, Load); // Users of the old loads now use the new load's chain. We know the // old-load value is dead now. CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1)); CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1)); return true; } return false; } /// Simplify an expression of the form (N0 cond N1) ? N2 : N3 /// where 'cond' is the comparison specified by CC. SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC, bool NotExtCompare) { // (x ? y : y) -> y. if (N2 == N3) return N2; EVT VT = N2.getValueType(); ConstantSDNode *N1C = dyn_cast(N1.getNode()); ConstantSDNode *N2C = dyn_cast(N2.getNode()); ConstantSDNode *N3C = dyn_cast(N3.getNode()); // Determine if the condition we're dealing with is constant SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1, CC, DL, false); if (SCC.getNode()) AddToWorklist(SCC.getNode()); ConstantSDNode *SCCC = dyn_cast_or_null(SCC.getNode()); // fold select_cc true, x, y -> x if (SCCC && !SCCC->isNullValue()) return N2; // fold select_cc false, x, y -> y if (SCCC && SCCC->isNullValue()) return N3; // Check to see if we can simplify the select into an fabs node if (ConstantFPSDNode *CFP = dyn_cast(N1)) { // Allow either -0.0 or 0.0 if (CFP->getValueAPF().isZero()) { // select (setg[te] X, +/-0.0), X, fneg(X) -> fabs if ((CC == ISD::SETGE || CC == ISD::SETGT) && N0 == N2 && N3.getOpcode() == ISD::FNEG && N2 == N3.getOperand(0)) return DAG.getNode(ISD::FABS, DL, VT, N0); // select (setl[te] X, +/-0.0), fneg(X), X -> fabs if ((CC == ISD::SETLT || CC == ISD::SETLE) && N0 == N3 && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N3) return DAG.getNode(ISD::FABS, DL, VT, N3); } } // Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)" // where "tmp" is a constant pool entry containing an array with 1.0 and 2.0 // in it. This is a win when the constant is not otherwise available because // it replaces two constant pool loads with one. We only do this if the FP // type is known to be legal, because if it isn't, then we are before legalize // types an we want the other legalization to happen first (e.g. to avoid // messing with soft float) and if the ConstantFP is not legal, because if // it is legal, we may not need to store the FP constant in a constant pool. if (ConstantFPSDNode *TV = dyn_cast(N2)) if (ConstantFPSDNode *FV = dyn_cast(N3)) { if (TLI.isTypeLegal(N2.getValueType()) && (TLI.getOperationAction(ISD::ConstantFP, N2.getValueType()) != TargetLowering::Legal && !TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0)) && !TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0))) && // If both constants have multiple uses, then we won't need to do an // extra load, they are likely around in registers for other users. (TV->hasOneUse() || FV->hasOneUse())) { Constant *Elts[] = { const_cast(FV->getConstantFPValue()), const_cast(TV->getConstantFPValue()) }; Type *FPTy = Elts[0]->getType(); const DataLayout &TD = *TLI.getDataLayout(); // Create a ConstantArray of the two constants. Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts); SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(), TD.getPrefTypeAlignment(FPTy)); unsigned Alignment = cast(CPIdx)->getAlignment(); // Get the offsets to the 0 and 1 element of the array so that we can // select between them. SDValue Zero = DAG.getIntPtrConstant(0); unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType()); SDValue One = DAG.getIntPtrConstant(EltSize); SDValue Cond = DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC); AddToWorklist(Cond.getNode()); SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), Cond, One, Zero); AddToWorklist(CstOffset.getNode()); CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset); AddToWorklist(CPIdx.getNode()); return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, Alignment); } } // Check to see if we can perform the "gzip trick", transforming // (select_cc setlt X, 0, A, 0) -> (and (sra X, (sub size(X), 1), A) if (N1C && N3C && N3C->isNullValue() && CC == ISD::SETLT && (N1C->isNullValue() || // (a < 0) ? b : 0 (N1C->getAPIntValue() == 1 && N0 == N2))) { // (a < 1) ? a : 0 EVT XType = N0.getValueType(); EVT AType = N2.getValueType(); if (XType.bitsGE(AType)) { // and (sra X, size(X)-1, A) -> "and (srl X, C2), A" iff A is a // single-bit constant. if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue()-1)) == 0)) { unsigned ShCtV = N2C->getAPIntValue().logBase2(); ShCtV = XType.getSizeInBits()-ShCtV-1; SDValue ShCt = DAG.getConstant(ShCtV, getShiftAmountTy(N0.getValueType())); SDValue Shift = DAG.getNode(ISD::SRL, SDLoc(N0), XType, N0, ShCt); AddToWorklist(Shift.getNode()); if (XType.bitsGT(AType)) { Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift); AddToWorklist(Shift.getNode()); } return DAG.getNode(ISD::AND, DL, AType, Shift, N2); } SDValue Shift = DAG.getNode(ISD::SRA, SDLoc(N0), XType, N0, DAG.getConstant(XType.getSizeInBits()-1, getShiftAmountTy(N0.getValueType()))); AddToWorklist(Shift.getNode()); if (XType.bitsGT(AType)) { Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift); AddToWorklist(Shift.getNode()); } return DAG.getNode(ISD::AND, DL, AType, Shift, N2); } } // fold (select_cc seteq (and x, y), 0, 0, A) -> (and (shr (shl x)) A) // where y is has a single bit set. // A plaintext description would be, we can turn the SELECT_CC into an AND // when the condition can be materialized as an all-ones register. Any // single bit-test can be materialized as an all-ones register with // shift-left and shift-right-arith. if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND && N0->getValueType(0) == VT && N1C && N1C->isNullValue() && N2C && N2C->isNullValue()) { SDValue AndLHS = N0->getOperand(0); ConstantSDNode *ConstAndRHS = dyn_cast(N0->getOperand(1)); if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) { // Shift the tested bit over the sign bit. APInt AndMask = ConstAndRHS->getAPIntValue(); SDValue ShlAmt = DAG.getConstant(AndMask.countLeadingZeros(), getShiftAmountTy(AndLHS.getValueType())); SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt); // Now arithmetic right shift it all the way over, so the result is either // all-ones, or zero. SDValue ShrAmt = DAG.getConstant(AndMask.getBitWidth()-1, getShiftAmountTy(Shl.getValueType())); SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt); return DAG.getNode(ISD::AND, DL, VT, Shr, N3); } } // fold select C, 16, 0 -> shl C, 4 if (N2C && N3C && N3C->isNullValue() && N2C->getAPIntValue().isPowerOf2() && TLI.getBooleanContents(N0.getValueType()) == TargetLowering::ZeroOrOneBooleanContent) { // If the caller doesn't want us to simplify this into a zext of a compare, // don't do it. if (NotExtCompare && N2C->getAPIntValue() == 1) return SDValue(); // Get a SetCC of the condition // NOTE: Don't create a SETCC if it's not legal on this target. if (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, LegalTypes ? getSetCCResultType(N0.getValueType()) : MVT::i1)) { SDValue Temp, SCC; // cast from setcc result type to select result type if (LegalTypes) { SCC = DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC); if (N2.getValueType().bitsLT(SCC.getValueType())) Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), N2.getValueType()); else Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), N2.getValueType(), SCC); } else { SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC); Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), N2.getValueType(), SCC); } AddToWorklist(SCC.getNode()); AddToWorklist(Temp.getNode()); if (N2C->getAPIntValue() == 1) return Temp; // shl setcc result by log2 n2c return DAG.getNode( ISD::SHL, DL, N2.getValueType(), Temp, DAG.getConstant(N2C->getAPIntValue().logBase2(), getShiftAmountTy(Temp.getValueType()))); } } // Check to see if this is the equivalent of setcc // FIXME: Turn all of these into setcc if setcc if setcc is legal // otherwise, go ahead with the folds. if (0 && N3C && N3C->isNullValue() && N2C && (N2C->getAPIntValue() == 1ULL)) { EVT XType = N0.getValueType(); if (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, getSetCCResultType(XType))) { SDValue Res = DAG.getSetCC(DL, getSetCCResultType(XType), N0, N1, CC); if (Res.getValueType() != VT) Res = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Res); return Res; } // fold (seteq X, 0) -> (srl (ctlz X, log2(size(X)))) if (N1C && N1C->isNullValue() && CC == ISD::SETEQ && (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, XType))) { SDValue Ctlz = DAG.getNode(ISD::CTLZ, SDLoc(N0), XType, N0); return DAG.getNode(ISD::SRL, DL, XType, Ctlz, DAG.getConstant(Log2_32(XType.getSizeInBits()), getShiftAmountTy(Ctlz.getValueType()))); } // fold (setgt X, 0) -> (srl (and (-X, ~X), size(X)-1)) if (N1C && N1C->isNullValue() && CC == ISD::SETGT) { SDValue NegN0 = DAG.getNode(ISD::SUB, SDLoc(N0), XType, DAG.getConstant(0, XType), N0); SDValue NotN0 = DAG.getNOT(SDLoc(N0), N0, XType); return DAG.getNode(ISD::SRL, DL, XType, DAG.getNode(ISD::AND, DL, XType, NegN0, NotN0), DAG.getConstant(XType.getSizeInBits()-1, getShiftAmountTy(XType))); } // fold (setgt X, -1) -> (xor (srl (X, size(X)-1), 1)) if (N1C && N1C->isAllOnesValue() && CC == ISD::SETGT) { SDValue Sign = DAG.getNode(ISD::SRL, SDLoc(N0), XType, N0, DAG.getConstant(XType.getSizeInBits()-1, getShiftAmountTy(N0.getValueType()))); return DAG.getNode(ISD::XOR, DL, XType, Sign, DAG.getConstant(1, XType)); } } // Check to see if this is an integer abs. // select_cc setg[te] X, 0, X, -X -> // select_cc setgt X, -1, X, -X -> // select_cc setl[te] X, 0, -X, X -> // select_cc setlt X, 1, -X, X -> // Y = sra (X, size(X)-1); xor (add (X, Y), Y) if (N1C) { ConstantSDNode *SubC = nullptr; if (((N1C->isNullValue() && (CC == ISD::SETGT || CC == ISD::SETGE)) || (N1C->isAllOnesValue() && CC == ISD::SETGT)) && N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1)) SubC = dyn_cast(N3.getOperand(0)); else if (((N1C->isNullValue() && (CC == ISD::SETLT || CC == ISD::SETLE)) || (N1C->isOne() && CC == ISD::SETLT)) && N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1)) SubC = dyn_cast(N2.getOperand(0)); EVT XType = N0.getValueType(); if (SubC && SubC->isNullValue() && XType.isInteger()) { SDValue Shift = DAG.getNode(ISD::SRA, SDLoc(N0), XType, N0, DAG.getConstant(XType.getSizeInBits()-1, getShiftAmountTy(N0.getValueType()))); SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), XType, N0, Shift); AddToWorklist(Shift.getNode()); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::XOR, DL, XType, Add, Shift); } } return SDValue(); } /// This is a stub for TargetLowering::SimplifySetCC. SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, SDLoc DL, bool foldBooleans) { TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, Level, false, this); return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL); } /// Given an ISD::SDIV node expressing a divide by constant, return /// a DAG expression to select that will generate the same value by multiplying /// by a magic number. /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". SDValue DAGCombiner::BuildSDIV(SDNode *N) { ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); if (!C) return SDValue(); // Avoid division by zero. if (!C->getAPIntValue()) return SDValue(); std::vector Built; SDValue S = TLI.BuildSDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built); for (SDNode *N : Built) AddToWorklist(N); return S; } /// Given an ISD::SDIV node expressing a divide by constant power of 2, return a /// DAG expression that will generate the same value by right shifting. SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) { ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); if (!C) return SDValue(); // Avoid division by zero. if (!C->getAPIntValue()) return SDValue(); std::vector Built; SDValue S = TLI.BuildSDIVPow2(N, C->getAPIntValue(), DAG, &Built); for (SDNode *N : Built) AddToWorklist(N); return S; } /// Given an ISD::UDIV node expressing a divide by constant, return a DAG /// expression that will generate the same value by multiplying by a magic /// number. /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". SDValue DAGCombiner::BuildUDIV(SDNode *N) { ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); if (!C) return SDValue(); // Avoid division by zero. if (!C->getAPIntValue()) return SDValue(); std::vector Built; SDValue S = TLI.BuildUDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built); for (SDNode *N : Built) AddToWorklist(N); return S; } SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op) { if (Level >= AfterLegalizeDAG) return SDValue(); // Expose the DAG combiner to the target combiner implementations. TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this); unsigned Iterations = 0; if (SDValue Est = TLI.getRecipEstimate(Op, DCI, Iterations)) { if (Iterations) { // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) // For the reciprocal, we need to find the zero of the function: // F(X) = A X - 1 [which has a zero at X = 1/A] // => // X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form // does not require additional intermediate precision] EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue FPOne = DAG.getConstantFP(1.0, VT); AddToWorklist(Est.getNode()); // Newton iterations: Est = Est + Est (1 - Arg * Est) for (unsigned i = 0; i < Iterations; ++i) { SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est); AddToWorklist(NewEst.getNode()); NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst); AddToWorklist(NewEst.getNode()); NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst); AddToWorklist(NewEst.getNode()); Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst); AddToWorklist(Est.getNode()); } } return Est; } return SDValue(); } /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) /// For the reciprocal sqrt, we need to find the zero of the function: /// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] /// => /// X_{i+1} = X_i (1.5 - A X_i^2 / 2) /// As a result, we precompute A/2 prior to the iteration loop. SDValue DAGCombiner::BuildRsqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations) { EVT VT = Arg.getValueType(); SDLoc DL(Arg); SDValue ThreeHalves = DAG.getConstantFP(1.5, VT); // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that // this entire sequence requires only one FP constant. SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg); AddToWorklist(HalfArg.getNode()); HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg); AddToWorklist(HalfArg.getNode()); // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) for (unsigned i = 0; i < Iterations; ++i) { SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est); AddToWorklist(NewEst.getNode()); NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst); AddToWorklist(NewEst.getNode()); NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst); AddToWorklist(NewEst.getNode()); Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst); AddToWorklist(Est.getNode()); } return Est; } /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) /// For the reciprocal sqrt, we need to find the zero of the function: /// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] /// => /// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0)) SDValue DAGCombiner::BuildRsqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations) { EVT VT = Arg.getValueType(); SDLoc DL(Arg); SDValue MinusThree = DAG.getConstantFP(-3.0, VT); SDValue MinusHalf = DAG.getConstantFP(-0.5, VT); // Newton iterations: Est = -0.5 * Est * (-3.0 + Arg * Est * Est) for (unsigned i = 0; i < Iterations; ++i) { SDValue HalfEst = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf); AddToWorklist(HalfEst.getNode()); Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Est); AddToWorklist(Est.getNode()); Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg); AddToWorklist(Est.getNode()); Est = DAG.getNode(ISD::FADD, DL, VT, Est, MinusThree); AddToWorklist(Est.getNode()); Est = DAG.getNode(ISD::FMUL, DL, VT, Est, HalfEst); AddToWorklist(Est.getNode()); } return Est; } SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op) { if (Level >= AfterLegalizeDAG) return SDValue(); // Expose the DAG combiner to the target combiner implementations. TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this); unsigned Iterations = 0; bool UseOneConstNR = false; if (SDValue Est = TLI.getRsqrtEstimate(Op, DCI, Iterations, UseOneConstNR)) { AddToWorklist(Est.getNode()); if (Iterations) { Est = UseOneConstNR ? BuildRsqrtNROneConst(Op, Est, Iterations) : BuildRsqrtNRTwoConst(Op, Est, Iterations); } return Est; } return SDValue(); } /// Return true if base is a frame index, which is known not to alias with /// anything but itself. Provides base object and offset as results. static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset, const GlobalValue *&GV, const void *&CV) { // Assume it is a primitive operation. Base = Ptr; Offset = 0; GV = nullptr; CV = nullptr; // If it's an adding a simple constant then integrate the offset. if (Base.getOpcode() == ISD::ADD) { if (ConstantSDNode *C = dyn_cast(Base.getOperand(1))) { Base = Base.getOperand(0); Offset += C->getZExtValue(); } } // Return the underlying GlobalValue, and update the Offset. Return false // for GlobalAddressSDNode since the same GlobalAddress may be represented // by multiple nodes with different offsets. if (GlobalAddressSDNode *G = dyn_cast(Base)) { GV = G->getGlobal(); Offset += G->getOffset(); return false; } // Return the underlying Constant value, and update the Offset. Return false // for ConstantSDNodes since the same constant pool entry may be represented // by multiple nodes with different offsets. if (ConstantPoolSDNode *C = dyn_cast(Base)) { CV = C->isMachineConstantPoolEntry() ? (const void *)C->getMachineCPVal() : (const void *)C->getConstVal(); Offset += C->getOffset(); return false; } // If it's any of the following then it can't alias with anything but itself. return isa(Base); } /// Return true if there is any possibility that the two addresses overlap. bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { // If they are the same then they must be aliases. if (Op0->getBasePtr() == Op1->getBasePtr()) return true; // If they are both volatile then they cannot be reordered. if (Op0->isVolatile() && Op1->isVolatile()) return true; // Gather base node and offset information. SDValue Base1, Base2; int64_t Offset1, Offset2; const GlobalValue *GV1, *GV2; const void *CV1, *CV2; bool isFrameIndex1 = FindBaseOffset(Op0->getBasePtr(), Base1, Offset1, GV1, CV1); bool isFrameIndex2 = FindBaseOffset(Op1->getBasePtr(), Base2, Offset2, GV2, CV2); // If they have a same base address then check to see if they overlap. if (Base1 == Base2 || (GV1 && (GV1 == GV2)) || (CV1 && (CV1 == CV2))) return !((Offset1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= Offset2 || (Offset2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= Offset1); // It is possible for different frame indices to alias each other, mostly // when tail call optimization reuses return address slots for arguments. // To catch this case, look up the actual index of frame indices to compute // the real alias relationship. if (isFrameIndex1 && isFrameIndex2) { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); Offset1 += MFI->getObjectOffset(cast(Base1)->getIndex()); Offset2 += MFI->getObjectOffset(cast(Base2)->getIndex()); return !((Offset1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= Offset2 || (Offset2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= Offset1); } // Otherwise, if we know what the bases are, and they aren't identical, then // we know they cannot alias. if ((isFrameIndex1 || CV1 || GV1) && (isFrameIndex2 || CV2 || GV2)) return false; // If we know required SrcValue1 and SrcValue2 have relatively large alignment // compared to the size and offset of the access, we may be able to prove they // do not alias. This check is conservative for now to catch cases created by // splitting vector types. if ((Op0->getOriginalAlignment() == Op1->getOriginalAlignment()) && (Op0->getSrcValueOffset() != Op1->getSrcValueOffset()) && (Op0->getMemoryVT().getSizeInBits() >> 3 == Op1->getMemoryVT().getSizeInBits() >> 3) && (Op0->getOriginalAlignment() > Op0->getMemoryVT().getSizeInBits()) >> 3) { int64_t OffAlign1 = Op0->getSrcValueOffset() % Op0->getOriginalAlignment(); int64_t OffAlign2 = Op1->getSrcValueOffset() % Op1->getOriginalAlignment(); // There is no overlap between these relatively aligned accesses of similar // size, return no alias. if ((OffAlign1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= OffAlign2 || (OffAlign2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= OffAlign1) return false; } bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0 ? CombinerGlobalAA : DAG.getSubtarget().useAA(); #ifndef NDEBUG if (CombinerAAOnlyFunc.getNumOccurrences() && CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) UseAA = false; #endif if (UseAA && Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) { // Use alias analysis information. int64_t MinOffset = std::min(Op0->getSrcValueOffset(), Op1->getSrcValueOffset()); int64_t Overlap1 = (Op0->getMemoryVT().getSizeInBits() >> 3) + Op0->getSrcValueOffset() - MinOffset; int64_t Overlap2 = (Op1->getMemoryVT().getSizeInBits() >> 3) + Op1->getSrcValueOffset() - MinOffset; AliasAnalysis::AliasResult AAResult = AA.alias(AliasAnalysis::Location(Op0->getMemOperand()->getValue(), Overlap1, UseTBAA ? Op0->getAAInfo() : AAMDNodes()), AliasAnalysis::Location(Op1->getMemOperand()->getValue(), Overlap2, UseTBAA ? Op1->getAAInfo() : AAMDNodes())); if (AAResult == AliasAnalysis::NoAlias) return false; } // Otherwise we have to assume they alias. return true; } /// Walk up chain skipping non-aliasing memory nodes, /// looking for aliasing nodes and adding them to the Aliases vector. void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, SmallVectorImpl &Aliases) { SmallVector Chains; // List of chains to visit. SmallPtrSet Visited; // Visited node set. // Get alias information for node. bool IsLoad = isa(N) && !cast(N)->isVolatile(); // Starting off. Chains.push_back(OriginalChain); unsigned Depth = 0; // Look at each chain and determine if it is an alias. If so, add it to the // aliases list. If not, then continue up the chain looking for the next // candidate. while (!Chains.empty()) { SDValue Chain = Chains.back(); Chains.pop_back(); // For TokenFactor nodes, look at each operand and only continue up the // chain until we find two aliases. If we've seen two aliases, assume we'll // find more and revert to original chain since the xform is unlikely to be // profitable. // // FIXME: The depth check could be made to return the last non-aliasing // chain we found before we hit a tokenfactor rather than the original // chain. if (Depth > 6 || Aliases.size() == 2) { Aliases.clear(); Aliases.push_back(OriginalChain); return; } // Don't bother if we've been before. if (!Visited.insert(Chain.getNode()).second) continue; switch (Chain.getOpcode()) { case ISD::EntryToken: // Entry token is ideal chain operand, but handled in FindBetterChain. break; case ISD::LOAD: case ISD::STORE: { // Get alias information for Chain. bool IsOpLoad = isa(Chain.getNode()) && !cast(Chain.getNode())->isVolatile(); // If chain is alias then stop here. if (!(IsLoad && IsOpLoad) && isAlias(cast(N), cast(Chain.getNode()))) { Aliases.push_back(Chain); } else { // Look further up the chain. Chains.push_back(Chain.getOperand(0)); ++Depth; } break; } case ISD::TokenFactor: // We have to check each of the operands of the token factor for "small" // token factors, so we queue them up. Adding the operands to the queue // (stack) in reverse order maintains the original order and increases the // likelihood that getNode will find a matching token factor (CSE.) if (Chain.getNumOperands() > 16) { Aliases.push_back(Chain); break; } for (unsigned n = Chain.getNumOperands(); n;) Chains.push_back(Chain.getOperand(--n)); ++Depth; break; default: // For all other instructions we will just have to take what we can get. Aliases.push_back(Chain); break; } } // We need to be careful here to also search for aliases through the // value operand of a store, etc. Consider the following situation: // Token1 = ... // L1 = load Token1, %52 // S1 = store Token1, L1, %51 // L2 = load Token1, %52+8 // S2 = store Token1, L2, %51+8 // Token2 = Token(S1, S2) // L3 = load Token2, %53 // S3 = store Token2, L3, %52 // L4 = load Token2, %53+8 // S4 = store Token2, L4, %52+8 // If we search for aliases of S3 (which loads address %52), and we look // only through the chain, then we'll miss the trivial dependence on L1 // (which also loads from %52). We then might change all loads and // stores to use Token1 as their chain operand, which could result in // copying %53 into %52 before copying %52 into %51 (which should // happen first). // // The problem is, however, that searching for such data dependencies // can become expensive, and the cost is not directly related to the // chain depth. Instead, we'll rule out such configurations here by // insisting that we've visited all chain users (except for users // of the original chain, which is not necessary). When doing this, // we need to look through nodes we don't care about (otherwise, things // like register copies will interfere with trivial cases). SmallVector Worklist; for (const SDNode *N : Visited) if (N != OriginalChain.getNode()) Worklist.push_back(N); while (!Worklist.empty()) { const SDNode *M = Worklist.pop_back_val(); // We have already visited M, and want to make sure we've visited any uses // of M that we care about. For uses that we've not visisted, and don't // care about, queue them to the worklist. for (SDNode::use_iterator UI = M->use_begin(), UIE = M->use_end(); UI != UIE; ++UI) if (UI.getUse().getValueType() == MVT::Other && Visited.insert(*UI).second) { if (isa(*UI) || isa(*UI)) { // We've not visited this use, and we care about it (it could have an // ordering dependency with the original node). Aliases.clear(); Aliases.push_back(OriginalChain); return; } // We've not visited this use, but we don't care about it. Mark it as // visited and enqueue it to the worklist. Worklist.push_back(*UI); } } } /// Walk up chain skipping non-aliasing memory nodes, looking for a better chain /// (aliasing node.) SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { SmallVector Aliases; // Ops for replacing token factor. // Accumulate all the aliases to this node. GatherAllAliases(N, OldChain, Aliases); // If no operands then chain to entry token. if (Aliases.size() == 0) return DAG.getEntryNode(); // If a single operand then chain to it. We don't need to revisit it. if (Aliases.size() == 1) return Aliases[0]; // Construct a custom tailored token factor. return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases); } /// This is the entry point for the file. void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA, CodeGenOpt::Level OptLevel) { /// This is the main entry point to this class. DAGCombiner(*this, AA, OptLevel).Run(Level); } Index: projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp =================================================================== --- projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp (revision 279025) @@ -1,3162 +1,3163 @@ //===----- LegalizeIntegerTypes.cpp - Legalization of integer types -------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements integer type expansion and promotion for LegalizeTypes. // Promotion is the act of changing a computation in an illegal type into a // computation in a larger type. For example, implementing i8 arithmetic in an // i32 register (often needed on powerpc). // Expansion is the act of changing a computation in an illegal type into a // computation in two identical registers of a smaller type. For example, // implementing i64 arithmetic in two i32 registers (often needed on 32-bit // targets). // //===----------------------------------------------------------------------===// #include "LegalizeTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "legalize-types" //===----------------------------------------------------------------------===// // Integer Result Promotion //===----------------------------------------------------------------------===// /// PromoteIntegerResult - This method is called when a result of a node is /// found to be in need of promotion to a larger type. At this point, the node /// may also have invalid operands or may have other results that need /// expansion, we just know that (at least) one result needs promotion. void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { DEBUG(dbgs() << "Promote integer result: "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); // See if the target wants to custom expand this node. if (CustomLowerNode(N, N->getValueType(ResNo), true)) return; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "PromoteIntegerResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to promote this operator!"); case ISD::MERGE_VALUES:Res = PromoteIntRes_MERGE_VALUES(N, ResNo); break; case ISD::AssertSext: Res = PromoteIntRes_AssertSext(N); break; case ISD::AssertZext: Res = PromoteIntRes_AssertZext(N); break; case ISD::BITCAST: Res = PromoteIntRes_BITCAST(N); break; case ISD::BSWAP: Res = PromoteIntRes_BSWAP(N); break; case ISD::BUILD_PAIR: Res = PromoteIntRes_BUILD_PAIR(N); break; case ISD::Constant: Res = PromoteIntRes_Constant(N); break; case ISD::CONVERT_RNDSAT: Res = PromoteIntRes_CONVERT_RNDSAT(N); break; case ISD::CTLZ_ZERO_UNDEF: case ISD::CTLZ: Res = PromoteIntRes_CTLZ(N); break; case ISD::CTPOP: Res = PromoteIntRes_CTPOP(N); break; case ISD::CTTZ_ZERO_UNDEF: case ISD::CTTZ: Res = PromoteIntRes_CTTZ(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break; case ISD::LOAD: Res = PromoteIntRes_LOAD(cast(N));break; case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast(N));break; case ISD::SELECT: Res = PromoteIntRes_SELECT(N); break; case ISD::VSELECT: Res = PromoteIntRes_VSELECT(N); break; case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break; case ISD::SETCC: Res = PromoteIntRes_SETCC(N); break; case ISD::SHL: Res = PromoteIntRes_SHL(N); break; case ISD::SIGN_EXTEND_INREG: Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break; case ISD::SRA: Res = PromoteIntRes_SRA(N); break; case ISD::SRL: Res = PromoteIntRes_SRL(N); break; case ISD::TRUNCATE: Res = PromoteIntRes_TRUNCATE(N); break; case ISD::UNDEF: Res = PromoteIntRes_UNDEF(N); break; case ISD::VAARG: Res = PromoteIntRes_VAARG(N); break; case ISD::EXTRACT_SUBVECTOR: Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break; case ISD::VECTOR_SHUFFLE: Res = PromoteIntRes_VECTOR_SHUFFLE(N); break; case ISD::INSERT_VECTOR_ELT: Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break; case ISD::BUILD_VECTOR: Res = PromoteIntRes_BUILD_VECTOR(N); break; case ISD::SCALAR_TO_VECTOR: Res = PromoteIntRes_SCALAR_TO_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = PromoteIntRes_CONCAT_VECTORS(N); break; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: Res = PromoteIntRes_INT_EXTEND(N); break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: Res = PromoteIntRes_FP_TO_XINT(N); break; case ISD::FP_TO_FP16: Res = PromoteIntRes_FP_TO_FP16(N); break; case ISD::AND: case ISD::OR: case ISD::XOR: case ISD::ADD: case ISD::SUB: case ISD::MUL: Res = PromoteIntRes_SimpleIntBinOp(N); break; case ISD::SDIV: case ISD::SREM: Res = PromoteIntRes_SDIV(N); break; case ISD::UDIV: case ISD::UREM: Res = PromoteIntRes_UDIV(N); break; case ISD::SADDO: case ISD::SSUBO: Res = PromoteIntRes_SADDSUBO(N, ResNo); break; case ISD::UADDO: case ISD::USUBO: Res = PromoteIntRes_UADDSUBO(N, ResNo); break; case ISD::SMULO: case ISD::UMULO: Res = PromoteIntRes_XMULO(N, ResNo); break; case ISD::ATOMIC_LOAD: Res = PromoteIntRes_Atomic0(cast(N)); break; case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case ISD::ATOMIC_SWAP: Res = PromoteIntRes_Atomic1(cast(N)); break; case ISD::ATOMIC_CMP_SWAP: case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: Res = PromoteIntRes_AtomicCmpSwap(cast(N), ResNo); break; } // If the result is null then the sub-method took care of registering it. if (Res.getNode()) SetPromotedInteger(SDValue(N, ResNo), Res); } SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo) { SDValue Op = DisintegrateMERGE_VALUES(N, ResNo); return GetPromotedInteger(Op); } SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) { // Sign-extend the new bits, and continue the assertion. SDValue Op = SExtPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::AssertSext, SDLoc(N), Op.getValueType(), Op, N->getOperand(1)); } SDValue DAGTypeLegalizer::PromoteIntRes_AssertZext(SDNode *N) { // Zero the new bits, and continue the assertion. SDValue Op = ZExtPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::AssertZext, SDLoc(N), Op.getValueType(), Op, N->getOperand(1)); } SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) { EVT ResVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(), ResVT, N->getChain(), N->getBasePtr(), N->getMemOperand(), N->getOrdering(), N->getSynchScope()); // Legalized the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) { SDValue Op2 = GetPromotedInteger(N->getOperand(2)); SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(), N->getChain(), N->getBasePtr(), Op2, N->getMemOperand(), N->getOrdering(), N->getSynchScope()); // Legalized the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, unsigned ResNo) { if (ResNo == 1) { assert(N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); EVT SVT = getSetCCResultType(N->getOperand(2).getValueType()); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1)); // Only use the result of getSetCCResultType if it is legal, // otherwise just use the promoted result type (NVT). if (!TLI.isTypeLegal(SVT)) SVT = NVT; SDVTList VTs = DAG.getVTList(N->getValueType(0), SVT, MVT::Other); SDValue Res = DAG.getAtomicCmpSwap( ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, SDLoc(N), N->getMemoryVT(), VTs, N->getChain(), N->getBasePtr(), N->getOperand(2), N->getOperand(3), N->getMemOperand(), N->getSuccessOrdering(), N->getFailureOrdering(), N->getSynchScope()); ReplaceValueWith(SDValue(N, 0), Res.getValue(0)); ReplaceValueWith(SDValue(N, 2), Res.getValue(2)); return Res.getValue(1); } SDValue Op2 = GetPromotedInteger(N->getOperand(2)); SDValue Op3 = GetPromotedInteger(N->getOperand(3)); SDVTList VTs = DAG.getVTList(Op2.getValueType(), N->getValueType(1), MVT::Other); SDValue Res = DAG.getAtomicCmpSwap( N->getOpcode(), SDLoc(N), N->getMemoryVT(), VTs, N->getChain(), N->getBasePtr(), Op2, Op3, N->getMemOperand(), N->getSuccessOrdering(), N->getFailureOrdering(), N->getSynchScope()); // Update the use to N with the newly created Res. for (unsigned i = 1, NumResults = N->getNumValues(); i < NumResults; ++i) ReplaceValueWith(SDValue(N, i), Res.getValue(i)); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) { SDValue InOp = N->getOperand(0); EVT InVT = InOp.getValueType(); EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT); EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); SDLoc dl(N); switch (getTypeAction(InVT)) { case TargetLowering::TypeLegal: break; case TargetLowering::TypePromoteInteger: if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector() && !NInVT.isVector()) // The input promotes to the same size. Convert the promoted value. return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetPromotedInteger(InOp)); break; case TargetLowering::TypeSoftenFloat: // Promote the integer operand by hand. return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp)); case TargetLowering::TypeExpandInteger: case TargetLowering::TypeExpandFloat: break; case TargetLowering::TypeScalarizeVector: // Convert the element to an integer and promote it by hand. if (!NOutVT.isVector()) return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, BitConvertToInteger(GetScalarizedVector(InOp))); break; case TargetLowering::TypeSplitVector: { // For example, i32 = BITCAST v2i16 on alpha. Convert the split // pieces of the input into integers and reassemble in the final type. SDValue Lo, Hi; GetSplitVector(N->getOperand(0), Lo, Hi); Lo = BitConvertToInteger(Lo); Hi = BitConvertToInteger(Hi); if (TLI.isBigEndian()) std::swap(Lo, Hi); InOp = DAG.getNode(ISD::ANY_EXTEND, dl, EVT::getIntegerVT(*DAG.getContext(), NOutVT.getSizeInBits()), JoinIntegers(Lo, Hi)); return DAG.getNode(ISD::BITCAST, dl, NOutVT, InOp); } case TargetLowering::TypeWidenVector: // The input is widened to the same size. Convert to the widened value. // Make sure that the outgoing value is not a vector, because this would // make us bitcast between two vectors which are legalized in different ways. if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector()) return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetWidenedVector(InOp)); } return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, CreateStackStoreLoad(InOp, OutVT)); } SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); EVT OVT = N->getValueType(0); EVT NVT = Op.getValueType(); SDLoc dl(N); unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(); return DAG.getNode(ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op), DAG.getConstant(DiffBits, TLI.getShiftAmountTy(NVT))); } SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) { // The pair element type may be legal, or may not promote to the same type as // the result, for example i14 = BUILD_PAIR (i7, i7). Handle all cases. return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)), JoinIntegers(N->getOperand(0), N->getOperand(1))); } SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) { EVT VT = N->getValueType(0); // FIXME there is no actual debug info here SDLoc dl(N); // Zero extend things like i1, sign extend everything else. It shouldn't // matter in theory which one we pick, but this tends to give better code? unsigned Opc = VT.isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; SDValue Result = DAG.getNode(Opc, dl, TLI.getTypeToTransformTo(*DAG.getContext(), VT), SDValue(N, 0)); assert(isa(Result) && "Didn't constant fold ext?"); return Result; } SDValue DAGTypeLegalizer::PromoteIntRes_CONVERT_RNDSAT(SDNode *N) { ISD::CvtCode CvtCode = cast(N)->getCvtCode(); assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU || CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU || CvtCode == ISD::CVT_SF || CvtCode == ISD::CVT_UF) && "can only promote integers"); EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); return DAG.getConvertRndSat(OutVT, SDLoc(N), N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), N->getOperand(4), CvtCode); } SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) { // Zero extend to the promoted type and do the count there. SDValue Op = ZExtPromotedInteger(N->getOperand(0)); SDLoc dl(N); EVT OVT = N->getValueType(0); EVT NVT = Op.getValueType(); Op = DAG.getNode(N->getOpcode(), dl, NVT, Op); // Subtract off the extra leading bits in the bigger type. return DAG.getNode( ISD::SUB, dl, NVT, Op, DAG.getConstant(NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), NVT)); } SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP(SDNode *N) { // Zero extend to the promoted type and do the count there. SDValue Op = ZExtPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::CTPOP, SDLoc(N), Op.getValueType(), Op); } SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); EVT OVT = N->getValueType(0); EVT NVT = Op.getValueType(); SDLoc dl(N); if (N->getOpcode() == ISD::CTTZ) { // The count is the same in the promoted type except if the original // value was zero. This can be handled by setting the bit just off // the top of the original type. auto TopBit = APInt::getOneBitSet(NVT.getScalarSizeInBits(), OVT.getScalarSizeInBits()); Op = DAG.getNode(ISD::OR, dl, NVT, Op, DAG.getConstant(TopBit, NVT)); } return DAG.getNode(N->getOpcode(), dl, NVT, Op); } SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N) { SDLoc dl(N); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NVT, N->getOperand(0), N->getOperand(1)); } SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned NewOpc = N->getOpcode(); SDLoc dl(N); // If we're promoting a UINT to a larger size and the larger FP_TO_UINT is // not Legal, check to see if we can use FP_TO_SINT instead. (If both UINT // and SINT conversions are Custom, there is no way to tell which is // preferable. We choose SINT because that's the right thing on PPC.) if (N->getOpcode() == ISD::FP_TO_UINT && !TLI.isOperationLegal(ISD::FP_TO_UINT, NVT) && TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT)) NewOpc = ISD::FP_TO_SINT; SDValue Res = DAG.getNode(NewOpc, dl, NVT, N->getOperand(0)); // Assert that the converted value fits in the original type. If it doesn't // (eg: because the value being converted is too big), then the result of the // original operation was undefined anyway, so the assert is still correct. return DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext : ISD::AssertSext, dl, NVT, Res, DAG.getValueType(N->getValueType(0).getScalarType())); } SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_FP16(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); return DAG.getNode(ISD::AssertZext, dl, NVT, Res, DAG.getValueType(N->getValueType(0))); } SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); if (getTypeAction(N->getOperand(0).getValueType()) == TargetLowering::TypePromoteInteger) { SDValue Res = GetPromotedInteger(N->getOperand(0)); assert(Res.getValueType().bitsLE(NVT) && "Extension doesn't make sense!"); // If the result and operand types are the same after promotion, simplify // to an in-register extension. if (NVT == Res.getValueType()) { // The high bits are not guaranteed to be anything. Insert an extend. if (N->getOpcode() == ISD::SIGN_EXTEND) return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res, DAG.getValueType(N->getOperand(0).getValueType())); if (N->getOpcode() == ISD::ZERO_EXTEND) return DAG.getZeroExtendInReg(Res, dl, N->getOperand(0).getValueType().getScalarType()); assert(N->getOpcode() == ISD::ANY_EXTEND && "Unknown integer extension!"); return Res; } } // Otherwise, just extend the original operand all the way to the larger type. return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); } SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) { assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!"); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(N) ? ISD::EXTLOAD : N->getExtensionType(); SDLoc dl(N); SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(), N->getMemoryVT(), N->getMemOperand()); // Legalized the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue ExtSrc0 = GetPromotedInteger(N->getSrc0()); - SDValue ExtMask = PromoteTargetBoolean(N->getMask(), NVT); + + SDValue Mask = N->getMask(); + EVT NewMaskVT = getSetCCResultType(NVT); + if (NewMaskVT != N->getMask().getValueType()) + Mask = PromoteTargetBoolean(Mask, NewMaskVT); SDLoc dl(N); - MachineMemOperand *MMO = DAG.getMachineFunction(). - getMachineMemOperand(N->getPointerInfo(), - MachineMemOperand::MOLoad, NVT.getStoreSize(), - N->getAlignment(), N->getAAInfo(), N->getRanges()); - SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(), - ExtMask, ExtSrc0, MMO); + Mask, ExtSrc0, N->getMemoryVT(), + N->getMemOperand(), ISD::SEXTLOAD); // Legalized the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } /// Promote the overflow flag of an overflowing arithmetic node. SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { // Simply change the return type of the boolean result. EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1)); EVT ValueVTs[] = { N->getValueType(0), NVT }; SDValue Ops[] = { N->getOperand(0), N->getOperand(1) }; SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N), DAG.getVTList(ValueVTs), Ops); // Modified the sum result - switch anything that used the old sum to use // the new one. ReplaceValueWith(SDValue(N, 0), Res); return SDValue(Res.getNode(), 1); } SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) { if (ResNo == 1) return PromoteIntRes_Overflow(N); // The operation overflowed iff the result in the larger type is not the // sign extension of its truncation to the original type. SDValue LHS = SExtPromotedInteger(N->getOperand(0)); SDValue RHS = SExtPromotedInteger(N->getOperand(1)); EVT OVT = N->getOperand(0).getValueType(); EVT NVT = LHS.getValueType(); SDLoc dl(N); // Do the arithmetic in the larger type. unsigned Opcode = N->getOpcode() == ISD::SADDO ? ISD::ADD : ISD::SUB; SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS); // Calculate the overflow flag: sign extend the arithmetic result from // the original type. SDValue Ofl = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res, DAG.getValueType(OVT)); // Overflowed if and only if this is not equal to Res. Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE); // Use the calculated overflow everywhere. ReplaceValueWith(SDValue(N, 1), Ofl); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_SDIV(SDNode *N) { // Sign extend the input. SDValue LHS = SExtPromotedInteger(N->getOperand(0)); SDValue RHS = SExtPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SELECT(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(1)); SDValue RHS = GetPromotedInteger(N->getOperand(2)); return DAG.getSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_VSELECT(SDNode *N) { SDValue Mask = N->getOperand(0); EVT OpTy = N->getOperand(1).getValueType(); // Promote all the way up to the canonical SetCC type. Mask = PromoteTargetBoolean(Mask, OpTy); SDValue LHS = GetPromotedInteger(N->getOperand(1)); SDValue RHS = GetPromotedInteger(N->getOperand(2)); return DAG.getNode(ISD::VSELECT, SDLoc(N), LHS.getValueType(), Mask, LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(2)); SDValue RHS = GetPromotedInteger(N->getOperand(3)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(), N->getOperand(0), N->getOperand(1), LHS, RHS, N->getOperand(4)); } SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) { EVT SVT = getSetCCResultType(N->getOperand(0).getValueType()); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); // Only use the result of getSetCCResultType if it is legal, // otherwise just use the promoted result type (NVT). if (!TLI.isTypeLegal(SVT)) SVT = NVT; SDLoc dl(N); assert(SVT.isVector() == N->getOperand(0).getValueType().isVector() && "Vector compare must return a vector result!"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); if (LHS.getValueType() != RHS.getValueType()) { if (getTypeAction(LHS.getValueType()) == TargetLowering::TypePromoteInteger && !LHS.getValueType().isVector()) LHS = GetPromotedInteger(LHS); if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger && !RHS.getValueType().isVector()) RHS = GetPromotedInteger(RHS); } // Get the SETCC result using the canonical SETCC type. SDValue SetCC = DAG.getNode(N->getOpcode(), dl, SVT, LHS, RHS, N->getOperand(2)); assert(NVT.bitsLE(SVT) && "Integer type overpromoted?"); // Convert to the expected type. return DAG.getNode(ISD::TRUNCATE, dl, NVT, SetCC); } SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) { SDValue Res = GetPromotedInteger(N->getOperand(0)); SDValue Amt = N->getOperand(1); Amt = Amt.getValueType().isVector() ? ZExtPromotedInteger(Amt) : Amt; return DAG.getNode(ISD::SHL, SDLoc(N), Res.getValueType(), Res, Amt); } SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), Op.getValueType(), Op, N->getOperand(1)); } SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) { // The input may have strange things in the top bits of the registers, but // these operations don't care. They may have weird bits going out, but // that too is okay if they are integer operations. SDValue LHS = GetPromotedInteger(N->getOperand(0)); SDValue RHS = GetPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) { // The input value must be properly sign extended. SDValue Res = SExtPromotedInteger(N->getOperand(0)); SDValue Amt = N->getOperand(1); Amt = Amt.getValueType().isVector() ? ZExtPromotedInteger(Amt) : Amt; return DAG.getNode(ISD::SRA, SDLoc(N), Res.getValueType(), Res, Amt); } SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) { // The input value must be properly zero extended. SDValue Res = ZExtPromotedInteger(N->getOperand(0)); SDValue Amt = N->getOperand(1); Amt = Amt.getValueType().isVector() ? ZExtPromotedInteger(Amt) : Amt; return DAG.getNode(ISD::SRL, SDLoc(N), Res.getValueType(), Res, Amt); } SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Res; SDValue InOp = N->getOperand(0); SDLoc dl(N); switch (getTypeAction(InOp.getValueType())) { default: llvm_unreachable("Unknown type action!"); case TargetLowering::TypeLegal: case TargetLowering::TypeExpandInteger: Res = InOp; break; case TargetLowering::TypePromoteInteger: Res = GetPromotedInteger(InOp); break; case TargetLowering::TypeSplitVector: EVT InVT = InOp.getValueType(); assert(InVT.isVector() && "Cannot split scalar types"); unsigned NumElts = InVT.getVectorNumElements(); assert(NumElts == NVT.getVectorNumElements() && "Dst and Src must have the same number of elements"); assert(isPowerOf2_32(NumElts) && "Promoted vector type must be a power of two"); SDValue EOp1, EOp2; GetSplitVector(InOp, EOp1, EOp2); EVT HalfNVT = EVT::getVectorVT(*DAG.getContext(), NVT.getScalarType(), NumElts/2); EOp1 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp1); EOp2 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, EOp1, EOp2); } // Truncate to NVT instead of VT return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res); } SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) { if (ResNo == 1) return PromoteIntRes_Overflow(N); // The operation overflowed iff the result in the larger type is not the // zero extension of its truncation to the original type. SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); EVT OVT = N->getOperand(0).getValueType(); EVT NVT = LHS.getValueType(); SDLoc dl(N); // Do the arithmetic in the larger type. unsigned Opcode = N->getOpcode() == ISD::UADDO ? ISD::ADD : ISD::SUB; SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS); // Calculate the overflow flag: zero extend the arithmetic result from // the original type. SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT); // Overflowed if and only if this is not equal to Res. Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE); // Use the calculated overflow everywhere. ReplaceValueWith(SDValue(N, 1), Ofl); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) { // Promote the overflow bit trivially. if (ResNo == 1) return PromoteIntRes_Overflow(N); SDValue LHS = N->getOperand(0), RHS = N->getOperand(1); SDLoc DL(N); EVT SmallVT = LHS.getValueType(); // To determine if the result overflowed in a larger type, we extend the // input to the larger type, do the multiply (checking if it overflows), // then also check the high bits of the result to see if overflow happened // there. if (N->getOpcode() == ISD::SMULO) { LHS = SExtPromotedInteger(LHS); RHS = SExtPromotedInteger(RHS); } else { LHS = ZExtPromotedInteger(LHS); RHS = ZExtPromotedInteger(RHS); } SDVTList VTs = DAG.getVTList(LHS.getValueType(), N->getValueType(1)); SDValue Mul = DAG.getNode(N->getOpcode(), DL, VTs, LHS, RHS); // Overflow occurred if it occurred in the larger type, or if the high part // of the result does not zero/sign-extend the low part. Check this second // possibility first. SDValue Overflow; if (N->getOpcode() == ISD::UMULO) { // Unsigned overflow occurred if the high part is non-zero. SDValue Hi = DAG.getNode(ISD::SRL, DL, Mul.getValueType(), Mul, DAG.getIntPtrConstant(SmallVT.getSizeInBits())); Overflow = DAG.getSetCC(DL, N->getValueType(1), Hi, DAG.getConstant(0, Hi.getValueType()), ISD::SETNE); } else { // Signed overflow occurred if the high part does not sign extend the low. SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Mul.getValueType(), Mul, DAG.getValueType(SmallVT)); Overflow = DAG.getSetCC(DL, N->getValueType(1), SExt, Mul, ISD::SETNE); } // The only other way for overflow to occur is if the multiplication in the // larger type itself overflowed. Overflow = DAG.getNode(ISD::OR, DL, N->getValueType(1), Overflow, SDValue(Mul.getNode(), 1)); // Use the calculated overflow everywhere. ReplaceValueWith(SDValue(N, 1), Overflow); return Mul; } SDValue DAGTypeLegalizer::PromoteIntRes_UDIV(SDNode *N) { // Zero extend the input. SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_UNDEF(SDNode *N) { return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0))); } SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) { SDValue Chain = N->getOperand(0); // Get the chain. SDValue Ptr = N->getOperand(1); // Get the pointer. EVT VT = N->getValueType(0); SDLoc dl(N); MVT RegVT = TLI.getRegisterType(*DAG.getContext(), VT); unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), VT); // The argument is passed as NumRegs registers of type RegVT. SmallVector Parts(NumRegs); for (unsigned i = 0; i < NumRegs; ++i) { Parts[i] = DAG.getVAArg(RegVT, dl, Chain, Ptr, N->getOperand(2), N->getConstantOperandVal(3)); Chain = Parts[i].getValue(1); } // Handle endianness of the load. if (TLI.isBigEndian()) std::reverse(Parts.begin(), Parts.end()); // Assemble the parts in the promoted type. EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Res = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[0]); for (unsigned i = 1; i < NumRegs; ++i) { SDValue Part = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[i]); // Shift it to the right position and "or" it in. Part = DAG.getNode(ISD::SHL, dl, NVT, Part, DAG.getConstant(i * RegVT.getSizeInBits(), TLI.getPointerTy())); Res = DAG.getNode(ISD::OR, dl, NVT, Res, Part); } // Modified the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Chain); return Res; } //===----------------------------------------------------------------------===// // Integer Operand Promotion //===----------------------------------------------------------------------===// /// PromoteIntegerOperand - This method is called when the specified operand of /// the specified node is found to need promotion. At this point, all of the /// result types of the node are known to be legal, but other operands of the /// node may need promotion or expansion as well as the specified one. bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { DEBUG(dbgs() << "Promote integer operand: "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) return false; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "PromoteIntegerOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to promote this operator's operand!"); case ISD::ANY_EXTEND: Res = PromoteIntOp_ANY_EXTEND(N); break; case ISD::ATOMIC_STORE: Res = PromoteIntOp_ATOMIC_STORE(cast(N)); break; case ISD::BITCAST: Res = PromoteIntOp_BITCAST(N); break; case ISD::BR_CC: Res = PromoteIntOp_BR_CC(N, OpNo); break; case ISD::BRCOND: Res = PromoteIntOp_BRCOND(N, OpNo); break; case ISD::BUILD_PAIR: Res = PromoteIntOp_BUILD_PAIR(N); break; case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = PromoteIntOp_CONCAT_VECTORS(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntOp_EXTRACT_VECTOR_ELT(N); break; case ISD::CONVERT_RNDSAT: Res = PromoteIntOp_CONVERT_RNDSAT(N); break; case ISD::INSERT_VECTOR_ELT: Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo);break; case ISD::SCALAR_TO_VECTOR: Res = PromoteIntOp_SCALAR_TO_VECTOR(N); break; case ISD::VSELECT: case ISD::SELECT: Res = PromoteIntOp_SELECT(N, OpNo); break; case ISD::SELECT_CC: Res = PromoteIntOp_SELECT_CC(N, OpNo); break; case ISD::SETCC: Res = PromoteIntOp_SETCC(N, OpNo); break; case ISD::SIGN_EXTEND: Res = PromoteIntOp_SIGN_EXTEND(N); break; case ISD::SINT_TO_FP: Res = PromoteIntOp_SINT_TO_FP(N); break; case ISD::STORE: Res = PromoteIntOp_STORE(cast(N), OpNo); break; case ISD::MSTORE: Res = PromoteIntOp_MSTORE(cast(N), OpNo); break; case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast(N), OpNo); break; case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break; case ISD::FP16_TO_FP: case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break; case ISD::ZERO_EXTEND: Res = PromoteIntOp_ZERO_EXTEND(N); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::ROTL: case ISD::ROTR: Res = PromoteIntOp_Shift(N); break; } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer // core about this. if (Res.getNode() == N) return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && "Invalid operand expansion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } /// PromoteSetCCOperands - Promote the operands of a comparison. This code is /// shared among BR_CC, SELECT_CC, and SETCC handlers. void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS, ISD::CondCode CCCode) { // We have to insert explicit sign or zero extends. Note that we could // insert sign extends for ALL conditions, but zero extend is cheaper on // many machines (an AND instead of two shifts), so prefer it. switch (CCCode) { default: llvm_unreachable("Unknown integer comparison!"); case ISD::SETEQ: case ISD::SETNE: { SDValue OpL = GetPromotedInteger(NewLHS); SDValue OpR = GetPromotedInteger(NewRHS); // We would prefer to promote the comparison operand with sign extension, // if we find the operand is actually to truncate an AssertSext. With this // optimization, we can avoid inserting real truncate instruction, which // is redudant eventually. if (OpL->getOpcode() == ISD::AssertSext && cast(OpL->getOperand(1))->getVT() == NewLHS.getValueType() && OpR->getOpcode() == ISD::AssertSext && cast(OpR->getOperand(1))->getVT() == NewRHS.getValueType()) { NewLHS = OpL; NewRHS = OpR; } else { NewLHS = ZExtPromotedInteger(NewLHS); NewRHS = ZExtPromotedInteger(NewRHS); } break; } case ISD::SETUGE: case ISD::SETUGT: case ISD::SETULE: case ISD::SETULT: // ALL of these operations will work if we either sign or zero extend // the operands (including the unsigned comparisons!). Zero extend is // usually a simpler/cheaper operation, so prefer it. NewLHS = ZExtPromotedInteger(NewLHS); NewRHS = ZExtPromotedInteger(NewRHS); break; case ISD::SETGE: case ISD::SETGT: case ISD::SETLT: case ISD::SETLE: NewLHS = SExtPromotedInteger(NewLHS); NewRHS = SExtPromotedInteger(NewRHS); break; } } SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Op); } SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) { SDValue Op2 = GetPromotedInteger(N->getOperand(2)); return DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(), N->getChain(), N->getBasePtr(), Op2, N->getMemOperand(), N->getOrdering(), N->getSynchScope()); } SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) { // This should only occur in unusual situations like bitcasting to an // x86_fp80, so just turn it into a store+load return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0)); } SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) { assert(OpNo == 2 && "Don't know how to promote this operand!"); SDValue LHS = N->getOperand(2); SDValue RHS = N->getOperand(3); PromoteSetCCOperands(LHS, RHS, cast(N->getOperand(1))->get()); // The chain (Op#0), CC (#1) and basic block destination (Op#4) are always // legal types. return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1), LHS, RHS, N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) { assert(OpNo == 1 && "only know how to promote condition"); // Promote all the way up to the canonical SetCC type. SDValue Cond = PromoteTargetBoolean(N->getOperand(1), MVT::Other); // The chain (Op#0) and basic block destination (Op#2) are always legal types. return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Cond, N->getOperand(2)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_PAIR(SDNode *N) { // Since the result type is legal, the operands must promote to it. EVT OVT = N->getOperand(0).getValueType(); SDValue Lo = ZExtPromotedInteger(N->getOperand(0)); SDValue Hi = GetPromotedInteger(N->getOperand(1)); assert(Lo.getValueType() == N->getValueType(0) && "Operand over promoted?"); SDLoc dl(N); Hi = DAG.getNode(ISD::SHL, dl, N->getValueType(0), Hi, DAG.getConstant(OVT.getSizeInBits(), TLI.getPointerTy())); return DAG.getNode(ISD::OR, dl, N->getValueType(0), Lo, Hi); } SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) { // The vector type is legal but the element type is not. This implies // that the vector is a power-of-two in length and that the element // type does not have a strange size (eg: it is not i1). EVT VecVT = N->getValueType(0); unsigned NumElts = VecVT.getVectorNumElements(); assert(!((NumElts & 1) && (!TLI.isTypeLegal(VecVT))) && "Legal vector of one illegal element?"); // Promote the inserted value. The type does not need to match the // vector element type. Check that any extra bits introduced will be // truncated away. assert(N->getOperand(0).getValueType().getSizeInBits() >= N->getValueType(0).getVectorElementType().getSizeInBits() && "Type of inserted value narrower than vector element type!"); SmallVector NewOps; for (unsigned i = 0; i < NumElts; ++i) NewOps.push_back(GetPromotedInteger(N->getOperand(i))); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_CONVERT_RNDSAT(SDNode *N) { ISD::CvtCode CvtCode = cast(N)->getCvtCode(); assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU || CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU || CvtCode == ISD::CVT_FS || CvtCode == ISD::CVT_FU) && "can only promote integer arguments"); SDValue InOp = GetPromotedInteger(N->getOperand(0)); return DAG.getConvertRndSat(N->getValueType(0), SDLoc(N), InOp, N->getOperand(1), N->getOperand(2), N->getOperand(3), N->getOperand(4), CvtCode); } SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo) { if (OpNo == 1) { // Promote the inserted value. This is valid because the type does not // have to match the vector element type. // Check that any extra bits introduced will be truncated away. assert(N->getOperand(1).getValueType().getSizeInBits() >= N->getValueType(0).getVectorElementType().getSizeInBits() && "Type of inserted value narrower than vector element type!"); return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), GetPromotedInteger(N->getOperand(1)), N->getOperand(2)), 0); } assert(OpNo == 2 && "Different operand and result vector types?"); // Promote the index. SDValue Idx = DAG.getZExtOrTrunc(N->getOperand(2), SDLoc(N), TLI.getVectorIdxTy()); return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1), Idx), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N) { // Integer SCALAR_TO_VECTOR operands are implicitly truncated, so just promote // the operand in place. return SDValue(DAG.UpdateNodeOperands(N, GetPromotedInteger(N->getOperand(0))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { assert(OpNo == 0 && "Only know how to promote the condition!"); SDValue Cond = N->getOperand(0); EVT OpTy = N->getOperand(1).getValueType(); // Promote all the way up to the canonical SetCC type. EVT OpVT = N->getOpcode() == ISD::SELECT ? OpTy.getScalarType() : OpTy; Cond = PromoteTargetBoolean(Cond, OpVT); return SDValue(DAG.UpdateNodeOperands(N, Cond, N->getOperand(1), N->getOperand(2)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) { assert(OpNo == 0 && "Don't know how to promote this operand!"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); PromoteSetCCOperands(LHS, RHS, cast(N->getOperand(4))->get()); // The CC (#4) and the possible return values (#2 and #3) have legal types. return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2), N->getOperand(3), N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SETCC(SDNode *N, unsigned OpNo) { assert(OpNo == 0 && "Don't know how to promote this operand!"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); PromoteSetCCOperands(LHS, RHS, cast(N->getOperand(2))->get()); // The CC (#2) is always legal. return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) { return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), ZExtPromotedInteger(N->getOperand(1))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); SDLoc dl(N); Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op); return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(), Op, DAG.getValueType(N->getOperand(0).getValueType())); } SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) { return SDValue(DAG.UpdateNodeOperands(N, SExtPromotedInteger(N->getOperand(0))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){ assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!"); SDValue Ch = N->getChain(), Ptr = N->getBasePtr(); SDLoc dl(N); SDValue Val = GetPromotedInteger(N->getValue()); // Get promoted value. // Truncate the value and store the result. return DAG.getTruncStore(Ch, dl, Val, Ptr, N->getMemoryVT(), N->getMemOperand()); } SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo){ assert(OpNo == 2 && "Only know how to promote the mask!"); - SDValue DataOp = N->getData(); + SDValue DataOp = N->getValue(); EVT DataVT = DataOp.getValueType(); SDValue Mask = N->getMask(); EVT MaskVT = Mask.getValueType(); SDLoc dl(N); + bool TruncateStore = false; if (!TLI.isTypeLegal(DataVT)) { if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) { DataOp = GetPromotedInteger(DataOp); Mask = PromoteTargetBoolean(Mask, DataOp.getValueType()); + TruncateStore = true; } else { assert(getTypeAction(DataVT) == TargetLowering::TypeWidenVector && "Unexpected data legalization in MSTORE"); DataOp = GetWidenedVector(DataOp); if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector) Mask = GetWidenedVector(Mask); else { EVT BoolVT = getSetCCResultType(DataOp.getValueType()); // We can't use ModifyToType() because we should fill the mask with // zeroes unsigned WidenNumElts = BoolVT.getVectorNumElements(); unsigned MaskNumElts = MaskVT.getVectorNumElements(); unsigned NumConcat = WidenNumElts / MaskNumElts; SmallVector Ops(NumConcat); SDValue ZeroVal = DAG.getConstant(0, MaskVT); Ops[0] = Mask; for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = ZeroVal; Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops); } } } else Mask = PromoteTargetBoolean(N->getMask(), DataOp.getValueType()); - SmallVector NewOps(N->op_begin(), N->op_end()); - NewOps[2] = Mask; - NewOps[3] = DataOp; - return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); + return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask, + N->getMemoryVT(), N->getMemOperand(), + TruncateStore); } SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo){ assert(OpNo == 2 && "Only know how to promote the mask!"); EVT DataVT = N->getValueType(0); SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[OpNo] = Mask; return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op); } SDValue DAGTypeLegalizer::PromoteIntOp_UINT_TO_FP(SDNode *N) { return SDValue(DAG.UpdateNodeOperands(N, ZExtPromotedInteger(N->getOperand(0))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) { SDLoc dl(N); SDValue Op = GetPromotedInteger(N->getOperand(0)); Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op); return DAG.getZeroExtendInReg(Op, dl, N->getOperand(0).getValueType().getScalarType()); } //===----------------------------------------------------------------------===// // Integer Result Expansion //===----------------------------------------------------------------------===// /// ExpandIntegerResult - This method is called when the specified result of the /// specified node is found to need expansion. At this point, the node may also /// have invalid operands or may have other results that need promotion, we just /// know that (at least) one result needs expansion. void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { DEBUG(dbgs() << "Expand integer result: "; N->dump(&DAG); dbgs() << "\n"); SDValue Lo, Hi; Lo = Hi = SDValue(); // See if the target wants to custom expand this node. if (CustomLowerNode(N, N->getValueType(ResNo), true)) return; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "ExpandIntegerResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to expand the result of this operator!"); case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::BITCAST: ExpandRes_BITCAST(N, Lo, Hi); break; case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break; case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break; case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break; case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break; case ISD::ANY_EXTEND: ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break; case ISD::AssertSext: ExpandIntRes_AssertSext(N, Lo, Hi); break; case ISD::AssertZext: ExpandIntRes_AssertZext(N, Lo, Hi); break; case ISD::BSWAP: ExpandIntRes_BSWAP(N, Lo, Hi); break; case ISD::Constant: ExpandIntRes_Constant(N, Lo, Hi); break; case ISD::CTLZ_ZERO_UNDEF: case ISD::CTLZ: ExpandIntRes_CTLZ(N, Lo, Hi); break; case ISD::CTPOP: ExpandIntRes_CTPOP(N, Lo, Hi); break; case ISD::CTTZ_ZERO_UNDEF: case ISD::CTTZ: ExpandIntRes_CTTZ(N, Lo, Hi); break; case ISD::FP_TO_SINT: ExpandIntRes_FP_TO_SINT(N, Lo, Hi); break; case ISD::FP_TO_UINT: ExpandIntRes_FP_TO_UINT(N, Lo, Hi); break; case ISD::LOAD: ExpandIntRes_LOAD(cast(N), Lo, Hi); break; case ISD::MUL: ExpandIntRes_MUL(N, Lo, Hi); break; case ISD::SDIV: ExpandIntRes_SDIV(N, Lo, Hi); break; case ISD::SIGN_EXTEND: ExpandIntRes_SIGN_EXTEND(N, Lo, Hi); break; case ISD::SIGN_EXTEND_INREG: ExpandIntRes_SIGN_EXTEND_INREG(N, Lo, Hi); break; case ISD::SREM: ExpandIntRes_SREM(N, Lo, Hi); break; case ISD::TRUNCATE: ExpandIntRes_TRUNCATE(N, Lo, Hi); break; case ISD::UDIV: ExpandIntRes_UDIV(N, Lo, Hi); break; case ISD::UREM: ExpandIntRes_UREM(N, Lo, Hi); break; case ISD::ZERO_EXTEND: ExpandIntRes_ZERO_EXTEND(N, Lo, Hi); break; case ISD::ATOMIC_LOAD: ExpandIntRes_ATOMIC_LOAD(N, Lo, Hi); break; case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case ISD::ATOMIC_SWAP: case ISD::ATOMIC_CMP_SWAP: { std::pair Tmp = ExpandAtomic(N); SplitInteger(Tmp.first, Lo, Hi); ReplaceValueWith(SDValue(N, 1), Tmp.second); break; } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { AtomicSDNode *AN = cast(N); SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::Other); SDValue Tmp = DAG.getAtomicCmpSwap( ISD::ATOMIC_CMP_SWAP, SDLoc(N), AN->getMemoryVT(), VTs, N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), AN->getMemOperand(), AN->getSuccessOrdering(), AN->getFailureOrdering(), AN->getSynchScope()); // Expanding to the strong ATOMIC_CMP_SWAP node means we can determine // success simply by comparing the loaded value against the ingoing // comparison. SDValue Success = DAG.getSetCC(SDLoc(N), N->getValueType(1), Tmp, N->getOperand(2), ISD::SETEQ); SplitInteger(Tmp, Lo, Hi); ReplaceValueWith(SDValue(N, 1), Success); ReplaceValueWith(SDValue(N, 2), Tmp.getValue(1)); break; } case ISD::AND: case ISD::OR: case ISD::XOR: ExpandIntRes_Logical(N, Lo, Hi); break; case ISD::ADD: case ISD::SUB: ExpandIntRes_ADDSUB(N, Lo, Hi); break; case ISD::ADDC: case ISD::SUBC: ExpandIntRes_ADDSUBC(N, Lo, Hi); break; case ISD::ADDE: case ISD::SUBE: ExpandIntRes_ADDSUBE(N, Lo, Hi); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: ExpandIntRes_Shift(N, Lo, Hi); break; case ISD::SADDO: case ISD::SSUBO: ExpandIntRes_SADDSUBO(N, Lo, Hi); break; case ISD::UADDO: case ISD::USUBO: ExpandIntRes_UADDSUBO(N, Lo, Hi); break; case ISD::UMULO: case ISD::SMULO: ExpandIntRes_XMULO(N, Lo, Hi); break; } // If Lo/Hi is null, the sub-method took care of registering results etc. if (Lo.getNode()) SetExpandedInteger(SDValue(N, ResNo), Lo, Hi); } /// Lower an atomic node to the appropriate builtin call. std::pair DAGTypeLegalizer::ExpandAtomic(SDNode *Node) { unsigned Opc = Node->getOpcode(); MVT VT = cast(Node)->getMemoryVT().getSimpleVT(); RTLIB::Libcall LC; switch (Opc) { default: llvm_unreachable("Unhandled atomic intrinsic Expand!"); case ISD::ATOMIC_SWAP: switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type for atomic!"); case MVT::i8: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_1; break; case MVT::i16: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_2; break; case MVT::i32: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_4; break; case MVT::i64: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_8; break; case MVT::i128:LC = RTLIB::SYNC_LOCK_TEST_AND_SET_16;break; } break; case ISD::ATOMIC_CMP_SWAP: switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type for atomic!"); case MVT::i8: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1; break; case MVT::i16: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2; break; case MVT::i32: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4; break; case MVT::i64: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8; break; case MVT::i128:LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16;break; } break; case ISD::ATOMIC_LOAD_ADD: switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type for atomic!"); case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_ADD_1; break; case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_ADD_2; break; case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_ADD_4; break; case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_ADD_8; break; case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_ADD_16;break; } break; case ISD::ATOMIC_LOAD_SUB: switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type for atomic!"); case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_SUB_1; break; case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_SUB_2; break; case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_SUB_4; break; case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_SUB_8; break; case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_SUB_16;break; } break; case ISD::ATOMIC_LOAD_AND: switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type for atomic!"); case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_AND_1; break; case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_AND_2; break; case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_AND_4; break; case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_AND_8; break; case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_AND_16;break; } break; case ISD::ATOMIC_LOAD_OR: switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type for atomic!"); case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_OR_1; break; case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_OR_2; break; case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_OR_4; break; case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_OR_8; break; case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_OR_16;break; } break; case ISD::ATOMIC_LOAD_XOR: switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type for atomic!"); case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_XOR_1; break; case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_XOR_2; break; case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_XOR_4; break; case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_XOR_8; break; case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_XOR_16;break; } break; case ISD::ATOMIC_LOAD_NAND: switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type for atomic!"); case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_NAND_1; break; case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_NAND_2; break; case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_NAND_4; break; case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_NAND_8; break; case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_NAND_16;break; } break; } return ExpandChainLibCall(LC, Node, false); } /// ExpandShiftByConstant - N is a shift by a value that needs to be expanded, /// and the shift amount is a constant 'Amt'. Expand the operation. void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, unsigned Amt, SDValue &Lo, SDValue &Hi) { assert(Amt && "Expected zero shifts to be already optimized away."); SDLoc DL(N); // Expand the incoming operand to be shifted, so that we have its parts SDValue InL, InH; GetExpandedInteger(N->getOperand(0), InL, InH); EVT NVT = InL.getValueType(); unsigned VTBits = N->getValueType(0).getSizeInBits(); unsigned NVTBits = NVT.getSizeInBits(); EVT ShTy = N->getOperand(1).getValueType(); if (N->getOpcode() == ISD::SHL) { if (Amt > VTBits) { Lo = Hi = DAG.getConstant(0, NVT); } else if (Amt > NVTBits) { Lo = DAG.getConstant(0, NVT); Hi = DAG.getNode(ISD::SHL, DL, NVT, InL, DAG.getConstant(Amt-NVTBits, ShTy)); } else if (Amt == NVTBits) { Lo = DAG.getConstant(0, NVT); Hi = InL; } else if (Amt == 1 && TLI.isOperationLegalOrCustom(ISD::ADDC, TLI.getTypeToExpandTo(*DAG.getContext(), NVT))) { // Emit this X << 1 as X+X. SDVTList VTList = DAG.getVTList(NVT, MVT::Glue); SDValue LoOps[2] = { InL, InL }; Lo = DAG.getNode(ISD::ADDC, DL, VTList, LoOps); SDValue HiOps[3] = { InH, InH, Lo.getValue(1) }; Hi = DAG.getNode(ISD::ADDE, DL, VTList, HiOps); } else { Lo = DAG.getNode(ISD::SHL, DL, NVT, InL, DAG.getConstant(Amt, ShTy)); Hi = DAG.getNode(ISD::OR, DL, NVT, DAG.getNode(ISD::SHL, DL, NVT, InH, DAG.getConstant(Amt, ShTy)), DAG.getNode(ISD::SRL, DL, NVT, InL, DAG.getConstant(NVTBits-Amt, ShTy))); } return; } if (N->getOpcode() == ISD::SRL) { if (Amt > VTBits) { Lo = DAG.getConstant(0, NVT); Hi = DAG.getConstant(0, NVT); } else if (Amt > NVTBits) { Lo = DAG.getNode(ISD::SRL, DL, NVT, InH, DAG.getConstant(Amt-NVTBits,ShTy)); Hi = DAG.getConstant(0, NVT); } else if (Amt == NVTBits) { Lo = InH; Hi = DAG.getConstant(0, NVT); } else { Lo = DAG.getNode(ISD::OR, DL, NVT, DAG.getNode(ISD::SRL, DL, NVT, InL, DAG.getConstant(Amt, ShTy)), DAG.getNode(ISD::SHL, DL, NVT, InH, DAG.getConstant(NVTBits-Amt, ShTy))); Hi = DAG.getNode(ISD::SRL, DL, NVT, InH, DAG.getConstant(Amt, ShTy)); } return; } assert(N->getOpcode() == ISD::SRA && "Unknown shift!"); if (Amt > VTBits) { Hi = Lo = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(NVTBits-1, ShTy)); } else if (Amt > NVTBits) { Lo = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(Amt-NVTBits, ShTy)); Hi = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(NVTBits-1, ShTy)); } else if (Amt == NVTBits) { Lo = InH; Hi = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(NVTBits-1, ShTy)); } else { Lo = DAG.getNode(ISD::OR, DL, NVT, DAG.getNode(ISD::SRL, DL, NVT, InL, DAG.getConstant(Amt, ShTy)), DAG.getNode(ISD::SHL, DL, NVT, InH, DAG.getConstant(NVTBits-Amt, ShTy))); Hi = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(Amt, ShTy)); } } /// ExpandShiftWithKnownAmountBit - Try to determine whether we can simplify /// this shift based on knowledge of the high bit of the shift amount. If we /// can tell this, we know that it is >= 32 or < 32, without knowing the actual /// shift amount. bool DAGTypeLegalizer:: ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Amt = N->getOperand(1); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); EVT ShTy = Amt.getValueType(); unsigned ShBits = ShTy.getScalarType().getSizeInBits(); unsigned NVTBits = NVT.getScalarType().getSizeInBits(); assert(isPowerOf2_32(NVTBits) && "Expanded integer type size not a power of two!"); SDLoc dl(N); APInt HighBitMask = APInt::getHighBitsSet(ShBits, ShBits - Log2_32(NVTBits)); APInt KnownZero, KnownOne; DAG.computeKnownBits(N->getOperand(1), KnownZero, KnownOne); // If we don't know anything about the high bits, exit. if (((KnownZero|KnownOne) & HighBitMask) == 0) return false; // Get the incoming operand to be shifted. SDValue InL, InH; GetExpandedInteger(N->getOperand(0), InL, InH); // If we know that any of the high bits of the shift amount are one, then we // can do this as a couple of simple shifts. if (KnownOne.intersects(HighBitMask)) { // Mask out the high bit, which we know is set. Amt = DAG.getNode(ISD::AND, dl, ShTy, Amt, DAG.getConstant(~HighBitMask, ShTy)); switch (N->getOpcode()) { default: llvm_unreachable("Unknown shift"); case ISD::SHL: Lo = DAG.getConstant(0, NVT); // Low part is zero. Hi = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt); // High part from Lo part. return true; case ISD::SRL: Hi = DAG.getConstant(0, NVT); // Hi part is zero. Lo = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt); // Lo part from Hi part. return true; case ISD::SRA: Hi = DAG.getNode(ISD::SRA, dl, NVT, InH, // Sign extend high part. DAG.getConstant(NVTBits-1, ShTy)); Lo = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt); // Lo part from Hi part. return true; } } // If we know that all of the high bits of the shift amount are zero, then we // can do this as a couple of simple shifts. if ((KnownZero & HighBitMask) == HighBitMask) { // Calculate 31-x. 31 is used instead of 32 to avoid creating an undefined // shift if x is zero. We can use XOR here because x is known to be smaller // than 32. SDValue Amt2 = DAG.getNode(ISD::XOR, dl, ShTy, Amt, DAG.getConstant(NVTBits-1, ShTy)); unsigned Op1, Op2; switch (N->getOpcode()) { default: llvm_unreachable("Unknown shift"); case ISD::SHL: Op1 = ISD::SHL; Op2 = ISD::SRL; break; case ISD::SRL: case ISD::SRA: Op1 = ISD::SRL; Op2 = ISD::SHL; break; } // When shifting right the arithmetic for Lo and Hi is swapped. if (N->getOpcode() != ISD::SHL) std::swap(InL, InH); // Use a little trick to get the bits that move from Lo to Hi. First // shift by one bit. SDValue Sh1 = DAG.getNode(Op2, dl, NVT, InL, DAG.getConstant(1, ShTy)); // Then compute the remaining shift with amount-1. SDValue Sh2 = DAG.getNode(Op2, dl, NVT, Sh1, Amt2); Lo = DAG.getNode(N->getOpcode(), dl, NVT, InL, Amt); Hi = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(Op1, dl, NVT, InH, Amt),Sh2); if (N->getOpcode() != ISD::SHL) std::swap(Hi, Lo); return true; } return false; } /// ExpandShiftWithUnknownAmountBit - Fully general expansion of integer shift /// of any size. bool DAGTypeLegalizer:: ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Amt = N->getOperand(1); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); EVT ShTy = Amt.getValueType(); unsigned NVTBits = NVT.getSizeInBits(); assert(isPowerOf2_32(NVTBits) && "Expanded integer type size not a power of two!"); SDLoc dl(N); // Get the incoming operand to be shifted. SDValue InL, InH; GetExpandedInteger(N->getOperand(0), InL, InH); SDValue NVBitsNode = DAG.getConstant(NVTBits, ShTy); SDValue AmtExcess = DAG.getNode(ISD::SUB, dl, ShTy, Amt, NVBitsNode); SDValue AmtLack = DAG.getNode(ISD::SUB, dl, ShTy, NVBitsNode, Amt); SDValue isShort = DAG.getSetCC(dl, getSetCCResultType(ShTy), Amt, NVBitsNode, ISD::SETULT); SDValue LoS, HiS, LoL, HiL; switch (N->getOpcode()) { default: llvm_unreachable("Unknown shift"); case ISD::SHL: // Short: ShAmt < NVTBits LoS = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt); HiS = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(ISD::SHL, dl, NVT, InH, Amt), // FIXME: If Amt is zero, the following shift generates an undefined result // on some architectures. DAG.getNode(ISD::SRL, dl, NVT, InL, AmtLack)); // Long: ShAmt >= NVTBits LoL = DAG.getConstant(0, NVT); // Lo part is zero. HiL = DAG.getNode(ISD::SHL, dl, NVT, InL, AmtExcess); // Hi from Lo part. Lo = DAG.getSelect(dl, NVT, isShort, LoS, LoL); Hi = DAG.getSelect(dl, NVT, isShort, HiS, HiL); return true; case ISD::SRL: // Short: ShAmt < NVTBits HiS = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt); LoS = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(ISD::SRL, dl, NVT, InL, Amt), // FIXME: If Amt is zero, the following shift generates an undefined result // on some architectures. DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack)); // Long: ShAmt >= NVTBits HiL = DAG.getConstant(0, NVT); // Hi part is zero. LoL = DAG.getNode(ISD::SRL, dl, NVT, InH, AmtExcess); // Lo from Hi part. Lo = DAG.getSelect(dl, NVT, isShort, LoS, LoL); Hi = DAG.getSelect(dl, NVT, isShort, HiS, HiL); return true; case ISD::SRA: // Short: ShAmt < NVTBits HiS = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt); LoS = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(ISD::SRL, dl, NVT, InL, Amt), // FIXME: If Amt is zero, the following shift generates an undefined result // on some architectures. DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack)); // Long: ShAmt >= NVTBits HiL = DAG.getNode(ISD::SRA, dl, NVT, InH, // Sign of Hi part. DAG.getConstant(NVTBits-1, ShTy)); LoL = DAG.getNode(ISD::SRA, dl, NVT, InH, AmtExcess); // Lo from Hi part. Lo = DAG.getSelect(dl, NVT, isShort, LoS, LoL); Hi = DAG.getSelect(dl, NVT, isShort, HiS, HiL); return true; } } void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); // Expand the subcomponents. SDValue LHSL, LHSH, RHSL, RHSH; GetExpandedInteger(N->getOperand(0), LHSL, LHSH); GetExpandedInteger(N->getOperand(1), RHSL, RHSH); EVT NVT = LHSL.getValueType(); SDValue LoOps[2] = { LHSL, RHSL }; SDValue HiOps[3] = { LHSH, RHSH }; // Do not generate ADDC/ADDE or SUBC/SUBE if the target does not support // them. TODO: Teach operation legalization how to expand unsupported // ADDC/ADDE/SUBC/SUBE. The problem is that these operations generate // a carry of type MVT::Glue, but there doesn't seem to be any way to // generate a value of this type in the expanded code sequence. bool hasCarry = TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ? ISD::ADDC : ISD::SUBC, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); if (hasCarry) { SDVTList VTList = DAG.getVTList(NVT, MVT::Glue); if (N->getOpcode() == ISD::ADD) { Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps); } else { Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps); } return; } if (N->getOpcode() == ISD::ADD) { Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps); Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2)); SDValue Cmp1 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0], ISD::SETULT); SDValue Carry1 = DAG.getSelect(dl, NVT, Cmp1, DAG.getConstant(1, NVT), DAG.getConstant(0, NVT)); SDValue Cmp2 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[1], ISD::SETULT); SDValue Carry2 = DAG.getSelect(dl, NVT, Cmp2, DAG.getConstant(1, NVT), Carry1); Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry2); } else { Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps); Hi = DAG.getNode(ISD::SUB, dl, NVT, makeArrayRef(HiOps, 2)); SDValue Cmp = DAG.getSetCC(dl, getSetCCResultType(LoOps[0].getValueType()), LoOps[0], LoOps[1], ISD::SETULT); SDValue Borrow = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, NVT), DAG.getConstant(0, NVT)); Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow); } } void DAGTypeLegalizer::ExpandIntRes_ADDSUBC(SDNode *N, SDValue &Lo, SDValue &Hi) { // Expand the subcomponents. SDValue LHSL, LHSH, RHSL, RHSH; SDLoc dl(N); GetExpandedInteger(N->getOperand(0), LHSL, LHSH); GetExpandedInteger(N->getOperand(1), RHSL, RHSH); SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Glue); SDValue LoOps[2] = { LHSL, RHSL }; SDValue HiOps[3] = { LHSH, RHSH }; if (N->getOpcode() == ISD::ADDC) { Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps); } else { Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps); } // Legalized the flag result - switch anything that used the old flag to // use the new one. ReplaceValueWith(SDValue(N, 1), Hi.getValue(1)); } void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N, SDValue &Lo, SDValue &Hi) { // Expand the subcomponents. SDValue LHSL, LHSH, RHSL, RHSH; SDLoc dl(N); GetExpandedInteger(N->getOperand(0), LHSL, LHSH); GetExpandedInteger(N->getOperand(1), RHSL, RHSH); SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Glue); SDValue LoOps[3] = { LHSL, RHSL, N->getOperand(2) }; SDValue HiOps[3] = { LHSH, RHSH }; Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps); // Legalized the flag result - switch anything that used the old flag to // use the new one. ReplaceValueWith(SDValue(N, 1), Hi.getValue(1)); } void DAGTypeLegalizer::ExpandIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo, SDValue &Lo, SDValue &Hi) { SDValue Res = DisintegrateMERGE_VALUES(N, ResNo); SplitInteger(Res, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); SDValue Op = N->getOperand(0); if (Op.getValueType().bitsLE(NVT)) { // The low part is any extension of the input (which degenerates to a copy). Lo = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Op); Hi = DAG.getUNDEF(NVT); // The high part is undefined. } else { // For example, extension of an i48 to an i64. The operand type necessarily // promotes to the result type, so will end up being expanded too. assert(getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteInteger && "Only know how to promote this result!"); SDValue Res = GetPromotedInteger(Op); assert(Res.getValueType() == N->getValueType(0) && "Operand over promoted?"); // Split the promoted operand. This will simplify when it is expanded. SplitInteger(Res, Lo, Hi); } } void DAGTypeLegalizer::ExpandIntRes_AssertSext(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); EVT EVT = cast(N->getOperand(1))->getVT(); unsigned NVTBits = NVT.getSizeInBits(); unsigned EVTBits = EVT.getSizeInBits(); if (NVTBits < EVTBits) { Hi = DAG.getNode(ISD::AssertSext, dl, NVT, Hi, DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), EVTBits - NVTBits))); } else { Lo = DAG.getNode(ISD::AssertSext, dl, NVT, Lo, DAG.getValueType(EVT)); // The high part replicates the sign bit of Lo, make it explicit. Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo, DAG.getConstant(NVTBits-1, TLI.getPointerTy())); } } void DAGTypeLegalizer::ExpandIntRes_AssertZext(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); EVT EVT = cast(N->getOperand(1))->getVT(); unsigned NVTBits = NVT.getSizeInBits(); unsigned EVTBits = EVT.getSizeInBits(); if (NVTBits < EVTBits) { Hi = DAG.getNode(ISD::AssertZext, dl, NVT, Hi, DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), EVTBits - NVTBits))); } else { Lo = DAG.getNode(ISD::AssertZext, dl, NVT, Lo, DAG.getValueType(EVT)); // The high part must be zero, make it explicit. Hi = DAG.getConstant(0, NVT); } } void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedInteger(N->getOperand(0), Hi, Lo); // Note swapped operands. Lo = DAG.getNode(ISD::BSWAP, dl, Lo.getValueType(), Lo); Hi = DAG.getNode(ISD::BSWAP, dl, Hi.getValueType(), Hi); } void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned NBitWidth = NVT.getSizeInBits(); auto Constant = cast(N); const APInt &Cst = Constant->getAPIntValue(); bool IsTarget = Constant->isTargetOpcode(); bool IsOpaque = Constant->isOpaque(); Lo = DAG.getConstant(Cst.trunc(NBitWidth), NVT, IsTarget, IsOpaque); Hi = DAG.getConstant(Cst.lshr(NBitWidth).trunc(NBitWidth), NVT, IsTarget, IsOpaque); } void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); // ctlz (HiLo) -> Hi != 0 ? ctlz(Hi) : (ctlz(Lo)+32) GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); SDValue HiNotZero = DAG.getSetCC(dl, getSetCCResultType(NVT), Hi, DAG.getConstant(0, NVT), ISD::SETNE); SDValue LoLZ = DAG.getNode(N->getOpcode(), dl, NVT, Lo); SDValue HiLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Hi); Lo = DAG.getSelect(dl, NVT, HiNotZero, HiLZ, DAG.getNode(ISD::ADD, dl, NVT, LoLZ, DAG.getConstant(NVT.getSizeInBits(), NVT))); Hi = DAG.getConstant(0, NVT); } void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); // ctpop(HiLo) -> ctpop(Hi)+ctpop(Lo) GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); Lo = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::CTPOP, dl, NVT, Lo), DAG.getNode(ISD::CTPOP, dl, NVT, Hi)); Hi = DAG.getConstant(0, NVT); } void DAGTypeLegalizer::ExpandIntRes_CTTZ(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); // cttz (HiLo) -> Lo != 0 ? cttz(Lo) : (cttz(Hi)+32) GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); SDValue LoNotZero = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, DAG.getConstant(0, NVT), ISD::SETNE); SDValue LoLZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, NVT, Lo); SDValue HiLZ = DAG.getNode(N->getOpcode(), dl, NVT, Hi); Lo = DAG.getSelect(dl, NVT, LoNotZero, LoLZ, DAG.getNode(ISD::ADD, dl, NVT, HiLZ, DAG.getConstant(NVT.getSizeInBits(), NVT))); Hi = DAG.getConstant(0, NVT); } void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue Op = N->getOperand(0); RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!"); SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, true/*irrelevant*/, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue Op = N->getOperand(0); RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!"); SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, false/*irrelevant*/, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi) { if (ISD::isNormalLoad(N)) { ExpandRes_NormalLoad(N, Lo, Hi); return; } assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!"); EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); ISD::LoadExtType ExtType = N->getExtensionType(); unsigned Alignment = N->getAlignment(); bool isVolatile = N->isVolatile(); bool isNonTemporal = N->isNonTemporal(); bool isInvariant = N->isInvariant(); AAMDNodes AAInfo = N->getAAInfo(); SDLoc dl(N); assert(NVT.isByteSized() && "Expanded type not byte sized!"); if (N->getMemoryVT().bitsLE(NVT)) { EVT MemVT = N->getMemoryVT(); Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(), MemVT, isVolatile, isNonTemporal, isInvariant, Alignment, AAInfo); // Remember the chain. Ch = Lo.getValue(1); if (ExtType == ISD::SEXTLOAD) { // The high part is obtained by SRA'ing all but one of the bits of the // lo part. unsigned LoSize = Lo.getValueType().getSizeInBits(); Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo, DAG.getConstant(LoSize-1, TLI.getPointerTy())); } else if (ExtType == ISD::ZEXTLOAD) { // The high part is just a zero. Hi = DAG.getConstant(0, NVT); } else { assert(ExtType == ISD::EXTLOAD && "Unknown extload!"); // The high part is undefined. Hi = DAG.getUNDEF(NVT); } } else if (TLI.isLittleEndian()) { // Little-endian - low bits are at low addresses. Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(), isVolatile, isNonTemporal, isInvariant, Alignment, AAInfo); unsigned ExcessBits = N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits(); EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits); // Increment the pointer to the other half. unsigned IncrementSize = NVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, Ptr.getValueType())); Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), NEVT, isVolatile, isNonTemporal, isInvariant, MinAlign(Alignment, IncrementSize), AAInfo); // Build a factor node to remember that this load is independent of the // other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); } else { // Big-endian - high bits are at low addresses. Favor aligned loads at // the cost of some bit-fiddling. EVT MemVT = N->getMemoryVT(); unsigned EBytes = MemVT.getStoreSize(); unsigned IncrementSize = NVT.getSizeInBits()/8; unsigned ExcessBits = (EBytes - IncrementSize)*8; // Load both the high bits and maybe some of the low bits. Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(), EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits() - ExcessBits), isVolatile, isNonTemporal, isInvariant, Alignment, AAInfo); // Increment the pointer to the other half. Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, Ptr.getValueType())); // Load the rest of the low bits. Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), EVT::getIntegerVT(*DAG.getContext(), ExcessBits), isVolatile, isNonTemporal, isInvariant, MinAlign(Alignment, IncrementSize), AAInfo); // Build a factor node to remember that this load is independent of the // other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); if (ExcessBits < NVT.getSizeInBits()) { // Transfer low bits from the bottom of Hi to the top of Lo. Lo = DAG.getNode(ISD::OR, dl, NVT, Lo, DAG.getNode(ISD::SHL, dl, NVT, Hi, DAG.getConstant(ExcessBits, TLI.getPointerTy()))); // Move high bits to the right position in Hi. Hi = DAG.getNode(ExtType == ISD::SEXTLOAD ? ISD::SRA : ISD::SRL, dl, NVT, Hi, DAG.getConstant(NVT.getSizeInBits() - ExcessBits, TLI.getPointerTy())); } } // Legalized the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Ch); } void DAGTypeLegalizer::ExpandIntRes_Logical(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); SDValue LL, LH, RL, RH; GetExpandedInteger(N->getOperand(0), LL, LH); GetExpandedInteger(N->getOperand(1), RL, RH); Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LL, RL); Hi = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LH, RH); } void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDLoc dl(N); SDValue LL, LH, RL, RH; GetExpandedInteger(N->getOperand(0), LL, LH); GetExpandedInteger(N->getOperand(1), RL, RH); if (TLI.expandMUL(N, Lo, Hi, NVT, DAG, LL, LH, RL, RH)) return; // If nothing else, we can make a libcall. RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::MUL_I16; else if (VT == MVT::i32) LC = RTLIB::MUL_I32; else if (VT == MVT::i64) LC = RTLIB::MUL_I64; else if (VT == MVT::i128) LC = RTLIB::MUL_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported MUL!"); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true/*irrelevant*/, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node, SDValue &Lo, SDValue &Hi) { SDValue LHS = Node->getOperand(0); SDValue RHS = Node->getOperand(1); SDLoc dl(Node); // Expand the result by simply replacing it with the equivalent // non-overflow-checking operation. SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ? ISD::ADD : ISD::SUB, dl, LHS.getValueType(), LHS, RHS); SplitInteger(Sum, Lo, Hi); // Compute the overflow. // // LHSSign -> LHS >= 0 // RHSSign -> RHS >= 0 // SumSign -> Sum >= 0 // // Add: // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign) // Sub: // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign) // EVT OType = Node->getValueType(1); SDValue Zero = DAG.getConstant(0, LHS.getValueType()); SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE); SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE); SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign, Node->getOpcode() == ISD::SADDO ? ISD::SETEQ : ISD::SETNE); SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE); SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE); SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE); // Use the calculated overflow everywhere. ReplaceValueWith(SDValue(Node, 1), Cmp); } void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::SDIV_I16; else if (VT == MVT::i32) LC = RTLIB::SDIV_I32; else if (VT == MVT::i64) LC = RTLIB::SDIV_I64; else if (VT == MVT::i128) LC = RTLIB::SDIV_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!"); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); // If we can emit an efficient shift operation, do so now. Check to see if // the RHS is a constant. if (ConstantSDNode *CN = dyn_cast(N->getOperand(1))) return ExpandShiftByConstant(N, CN->getZExtValue(), Lo, Hi); // If we can determine that the high bit of the shift is zero or one, even if // the low bits are variable, emit this shift in an optimized form. if (ExpandShiftWithKnownAmountBit(N, Lo, Hi)) return; // If this target supports shift_PARTS, use it. First, map to the _PARTS opc. unsigned PartsOpc; if (N->getOpcode() == ISD::SHL) { PartsOpc = ISD::SHL_PARTS; } else if (N->getOpcode() == ISD::SRL) { PartsOpc = ISD::SRL_PARTS; } else { assert(N->getOpcode() == ISD::SRA && "Unknown shift!"); PartsOpc = ISD::SRA_PARTS; } // Next check to see if the target supports this SHL_PARTS operation or if it // will custom expand it. EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); TargetLowering::LegalizeAction Action = TLI.getOperationAction(PartsOpc, NVT); if ((Action == TargetLowering::Legal && TLI.isTypeLegal(NVT)) || Action == TargetLowering::Custom) { // Expand the subcomponents. SDValue LHSL, LHSH; GetExpandedInteger(N->getOperand(0), LHSL, LHSH); EVT VT = LHSL.getValueType(); // If the shift amount operand is coming from a vector legalization it may // have an illegal type. Fix that first by casting the operand, otherwise // the new SHL_PARTS operation would need further legalization. SDValue ShiftOp = N->getOperand(1); EVT ShiftTy = TLI.getShiftAmountTy(VT); assert(ShiftTy.getScalarType().getSizeInBits() >= Log2_32_Ceil(VT.getScalarType().getSizeInBits()) && "ShiftAmountTy is too small to cover the range of this type!"); if (ShiftOp.getValueType() != ShiftTy) ShiftOp = DAG.getZExtOrTrunc(ShiftOp, dl, ShiftTy); SDValue Ops[] = { LHSL, LHSH, ShiftOp }; Lo = DAG.getNode(PartsOpc, dl, DAG.getVTList(VT, VT), Ops); Hi = Lo.getValue(1); return; } // Otherwise, emit a libcall. RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; bool isSigned; if (N->getOpcode() == ISD::SHL) { isSigned = false; /*sign irrelevant*/ if (VT == MVT::i16) LC = RTLIB::SHL_I16; else if (VT == MVT::i32) LC = RTLIB::SHL_I32; else if (VT == MVT::i64) LC = RTLIB::SHL_I64; else if (VT == MVT::i128) LC = RTLIB::SHL_I128; } else if (N->getOpcode() == ISD::SRL) { isSigned = false; if (VT == MVT::i16) LC = RTLIB::SRL_I16; else if (VT == MVT::i32) LC = RTLIB::SRL_I32; else if (VT == MVT::i64) LC = RTLIB::SRL_I64; else if (VT == MVT::i128) LC = RTLIB::SRL_I128; } else { assert(N->getOpcode() == ISD::SRA && "Unknown shift!"); isSigned = true; if (VT == MVT::i16) LC = RTLIB::SRA_I16; else if (VT == MVT::i32) LC = RTLIB::SRA_I32; else if (VT == MVT::i64) LC = RTLIB::SRA_I64; else if (VT == MVT::i128) LC = RTLIB::SRA_I128; } if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, isSigned, dl).first, Lo, Hi); return; } if (!ExpandShiftWithUnknownAmountBit(N, Lo, Hi)) llvm_unreachable("Unsupported shift!"); } void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); SDValue Op = N->getOperand(0); if (Op.getValueType().bitsLE(NVT)) { // The low part is sign extension of the input (degenerates to a copy). Lo = DAG.getNode(ISD::SIGN_EXTEND, dl, NVT, N->getOperand(0)); // The high part is obtained by SRA'ing all but one of the bits of low part. unsigned LoSize = NVT.getSizeInBits(); Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo, DAG.getConstant(LoSize-1, TLI.getPointerTy())); } else { // For example, extension of an i48 to an i64. The operand type necessarily // promotes to the result type, so will end up being expanded too. assert(getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteInteger && "Only know how to promote this result!"); SDValue Res = GetPromotedInteger(Op); assert(Res.getValueType() == N->getValueType(0) && "Operand over promoted?"); // Split the promoted operand. This will simplify when it is expanded. SplitInteger(Res, Lo, Hi); unsigned ExcessBits = Op.getValueType().getSizeInBits() - NVT.getSizeInBits(); Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi, DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), ExcessBits))); } } void DAGTypeLegalizer:: ExpandIntRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT EVT = cast(N->getOperand(1))->getVT(); if (EVT.bitsLE(Lo.getValueType())) { // sext_inreg the low part if needed. Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Lo.getValueType(), Lo, N->getOperand(1)); // The high part gets the sign extension from the lo-part. This handles // things like sextinreg V:i64 from i8. Hi = DAG.getNode(ISD::SRA, dl, Hi.getValueType(), Lo, DAG.getConstant(Hi.getValueType().getSizeInBits()-1, TLI.getPointerTy())); } else { // For example, extension of an i48 to an i64. Leave the low part alone, // sext_inreg the high part. unsigned ExcessBits = EVT.getSizeInBits() - Lo.getValueType().getSizeInBits(); Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi, DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), ExcessBits))); } } void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::SREM_I16; else if (VT == MVT::i32) LC = RTLIB::SREM_I32; else if (VT == MVT::i64) LC = RTLIB::SREM_I64; else if (VT == MVT::i128) LC = RTLIB::SREM_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!"); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); Lo = DAG.getNode(ISD::TRUNCATE, dl, NVT, N->getOperand(0)); Hi = DAG.getNode(ISD::SRL, dl, N->getOperand(0).getValueType(), N->getOperand(0), DAG.getConstant(NVT.getSizeInBits(), TLI.getPointerTy())); Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi); } void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDLoc dl(N); // Expand the result by simply replacing it with the equivalent // non-overflow-checking operation. SDValue Sum = DAG.getNode(N->getOpcode() == ISD::UADDO ? ISD::ADD : ISD::SUB, dl, LHS.getValueType(), LHS, RHS); SplitInteger(Sum, Lo, Hi); // Calculate the overflow: addition overflows iff a + b < a, and subtraction // overflows iff a - b > a. SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS, N->getOpcode () == ISD::UADDO ? ISD::SETULT : ISD::SETUGT); // Use the calculated overflow everywhere. ReplaceValueWith(SDValue(N, 1), Ofl); } void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); // A divide for UMULO should be faster than a function call. if (N->getOpcode() == ISD::UMULO) { SDValue LHS = N->getOperand(0), RHS = N->getOperand(1); SDValue MUL = DAG.getNode(ISD::MUL, dl, LHS.getValueType(), LHS, RHS); SplitInteger(MUL, Lo, Hi); // A divide for UMULO will be faster than a function call. Select to // make sure we aren't using 0. SDValue isZero = DAG.getSetCC(dl, getSetCCResultType(VT), RHS, DAG.getConstant(0, VT), ISD::SETEQ); SDValue NotZero = DAG.getSelect(dl, VT, isZero, DAG.getConstant(1, VT), RHS); SDValue DIV = DAG.getNode(ISD::UDIV, dl, VT, MUL, NotZero); SDValue Overflow = DAG.getSetCC(dl, N->getValueType(1), DIV, LHS, ISD::SETNE); Overflow = DAG.getSelect(dl, N->getValueType(1), isZero, DAG.getConstant(0, N->getValueType(1)), Overflow); ReplaceValueWith(SDValue(N, 1), Overflow); return; } Type *RetTy = VT.getTypeForEVT(*DAG.getContext()); EVT PtrVT = TLI.getPointerTy(); Type *PtrTy = PtrVT.getTypeForEVT(*DAG.getContext()); // Replace this with a libcall that will check overflow. RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i32) LC = RTLIB::MULO_I32; else if (VT == MVT::i64) LC = RTLIB::MULO_I64; else if (VT == MVT::i128) LC = RTLIB::MULO_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XMULO!"); SDValue Temp = DAG.CreateStackTemporary(PtrVT); // Temporary for the overflow value, default it to zero. SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, DAG.getConstant(0, PtrVT), Temp, MachinePointerInfo(), false, false, 0); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { EVT ArgVT = N->getOperand(i).getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = N->getOperand(i); Entry.Ty = ArgTy; Entry.isSExt = true; Entry.isZExt = false; Args.push_back(Entry); } // Also pass the address of the overflow check. Entry.Node = Temp; Entry.Ty = PtrTy->getPointerTo(); Entry.isSExt = true; Entry.isZExt = false; Args.push_back(Entry); SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args), 0) .setSExtResult(); std::pair CallInfo = TLI.LowerCallTo(CLI); SplitInteger(CallInfo.first, Lo, Hi); SDValue Temp2 = DAG.getLoad(PtrVT, dl, CallInfo.second, Temp, MachinePointerInfo(), false, false, false, 0); SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Temp2, DAG.getConstant(0, PtrVT), ISD::SETNE); // Use the overflow from the libcall everywhere. ReplaceValueWith(SDValue(N, 1), Ofl); } void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::UDIV_I16; else if (VT == MVT::i32) LC = RTLIB::UDIV_I32; else if (VT == MVT::i64) LC = RTLIB::UDIV_I64; else if (VT == MVT::i128) LC = RTLIB::UDIV_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!"); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::UREM_I16; else if (VT == MVT::i32) LC = RTLIB::UREM_I32; else if (VT == MVT::i64) LC = RTLIB::UREM_I64; else if (VT == MVT::i128) LC = RTLIB::UREM_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!"); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); SDValue Op = N->getOperand(0); if (Op.getValueType().bitsLE(NVT)) { // The low part is zero extension of the input (degenerates to a copy). Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N->getOperand(0)); Hi = DAG.getConstant(0, NVT); // The high part is just a zero. } else { // For example, extension of an i48 to an i64. The operand type necessarily // promotes to the result type, so will end up being expanded too. assert(getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteInteger && "Only know how to promote this result!"); SDValue Res = GetPromotedInteger(Op); assert(Res.getValueType() == N->getValueType(0) && "Operand over promoted?"); // Split the promoted operand. This will simplify when it is expanded. SplitInteger(Res, Lo, Hi); unsigned ExcessBits = Op.getValueType().getSizeInBits() - NVT.getSizeInBits(); Hi = DAG.getZeroExtendInReg(Hi, dl, EVT::getIntegerVT(*DAG.getContext(), ExcessBits)); } } void DAGTypeLegalizer::ExpandIntRes_ATOMIC_LOAD(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); EVT VT = cast(N)->getMemoryVT(); SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); SDValue Zero = DAG.getConstant(0, VT); SDValue Swap = DAG.getAtomicCmpSwap( ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, cast(N)->getMemoryVT(), VTs, N->getOperand(0), N->getOperand(1), Zero, Zero, cast(N)->getMemOperand(), cast(N)->getOrdering(), cast(N)->getOrdering(), cast(N)->getSynchScope()); ReplaceValueWith(SDValue(N, 0), Swap.getValue(0)); ReplaceValueWith(SDValue(N, 1), Swap.getValue(2)); } //===----------------------------------------------------------------------===// // Integer Operand Expansion //===----------------------------------------------------------------------===// /// ExpandIntegerOperand - This method is called when the specified operand of /// the specified node is found to need expansion. At this point, all of the /// result types of the node are known to be legal, but other operands of the /// node may need promotion or expansion as well as the specified one. bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) { DEBUG(dbgs() << "Expand integer operand: "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) return false; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "ExpandIntegerOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to expand this operator's operand!"); case ISD::BITCAST: Res = ExpandOp_BITCAST(N); break; case ISD::BR_CC: Res = ExpandIntOp_BR_CC(N); break; case ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break; case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break; case ISD::INSERT_VECTOR_ELT: Res = ExpandOp_INSERT_VECTOR_ELT(N); break; case ISD::SCALAR_TO_VECTOR: Res = ExpandOp_SCALAR_TO_VECTOR(N); break; case ISD::SELECT_CC: Res = ExpandIntOp_SELECT_CC(N); break; case ISD::SETCC: Res = ExpandIntOp_SETCC(N); break; case ISD::SINT_TO_FP: Res = ExpandIntOp_SINT_TO_FP(N); break; case ISD::STORE: Res = ExpandIntOp_STORE(cast(N), OpNo); break; case ISD::TRUNCATE: Res = ExpandIntOp_TRUNCATE(N); break; case ISD::UINT_TO_FP: Res = ExpandIntOp_UINT_TO_FP(N); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::ROTL: case ISD::ROTR: Res = ExpandIntOp_Shift(N); break; case ISD::RETURNADDR: case ISD::FRAMEADDR: Res = ExpandIntOp_RETURNADDR(N); break; case ISD::ATOMIC_STORE: Res = ExpandIntOp_ATOMIC_STORE(N); break; } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer // core about this. if (Res.getNode() == N) return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && "Invalid operand expansion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } /// IntegerExpandSetCCOperands - Expand the operands of a comparison. This code /// is shared among BR_CC, SELECT_CC, and SETCC handlers. void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, SDLoc dl) { SDValue LHSLo, LHSHi, RHSLo, RHSHi; GetExpandedInteger(NewLHS, LHSLo, LHSHi); GetExpandedInteger(NewRHS, RHSLo, RHSHi); if (CCCode == ISD::SETEQ || CCCode == ISD::SETNE) { if (RHSLo == RHSHi) { if (ConstantSDNode *RHSCST = dyn_cast(RHSLo)) { if (RHSCST->isAllOnesValue()) { // Equality comparison to -1. NewLHS = DAG.getNode(ISD::AND, dl, LHSLo.getValueType(), LHSLo, LHSHi); NewRHS = RHSLo; return; } } } NewLHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSLo, RHSLo); NewRHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSHi, RHSHi); NewLHS = DAG.getNode(ISD::OR, dl, NewLHS.getValueType(), NewLHS, NewRHS); NewRHS = DAG.getConstant(0, NewLHS.getValueType()); return; } // If this is a comparison of the sign bit, just look at the top part. // X > -1, x < 0 if (ConstantSDNode *CST = dyn_cast(NewRHS)) if ((CCCode == ISD::SETLT && CST->isNullValue()) || // X < 0 (CCCode == ISD::SETGT && CST->isAllOnesValue())) { // X > -1 NewLHS = LHSHi; NewRHS = RHSHi; return; } // FIXME: This generated code sucks. ISD::CondCode LowCC; switch (CCCode) { default: llvm_unreachable("Unknown integer setcc!"); case ISD::SETLT: case ISD::SETULT: LowCC = ISD::SETULT; break; case ISD::SETGT: case ISD::SETUGT: LowCC = ISD::SETUGT; break; case ISD::SETLE: case ISD::SETULE: LowCC = ISD::SETULE; break; case ISD::SETGE: case ISD::SETUGE: LowCC = ISD::SETUGE; break; } // Tmp1 = lo(op1) < lo(op2) // Always unsigned comparison // Tmp2 = hi(op1) < hi(op2) // Signedness depends on operands // dest = hi(op1) == hi(op2) ? Tmp1 : Tmp2; // NOTE: on targets without efficient SELECT of bools, we can always use // this identity: (B1 ? B2 : B3) --> (B1 & B2)|(!B1&B3) TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, AfterLegalizeTypes, true, nullptr); SDValue Tmp1, Tmp2; if (TLI.isTypeLegal(LHSLo.getValueType()) && TLI.isTypeLegal(RHSLo.getValueType())) Tmp1 = TLI.SimplifySetCC(getSetCCResultType(LHSLo.getValueType()), LHSLo, RHSLo, LowCC, false, DagCombineInfo, dl); if (!Tmp1.getNode()) Tmp1 = DAG.getSetCC(dl, getSetCCResultType(LHSLo.getValueType()), LHSLo, RHSLo, LowCC); if (TLI.isTypeLegal(LHSHi.getValueType()) && TLI.isTypeLegal(RHSHi.getValueType())) Tmp2 = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi, CCCode, false, DagCombineInfo, dl); if (!Tmp2.getNode()) Tmp2 = DAG.getNode(ISD::SETCC, dl, getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi, DAG.getCondCode(CCCode)); ConstantSDNode *Tmp1C = dyn_cast(Tmp1.getNode()); ConstantSDNode *Tmp2C = dyn_cast(Tmp2.getNode()); if ((Tmp1C && Tmp1C->isNullValue()) || (Tmp2C && Tmp2C->isNullValue() && (CCCode == ISD::SETLE || CCCode == ISD::SETGE || CCCode == ISD::SETUGE || CCCode == ISD::SETULE)) || (Tmp2C && Tmp2C->getAPIntValue() == 1 && (CCCode == ISD::SETLT || CCCode == ISD::SETGT || CCCode == ISD::SETUGT || CCCode == ISD::SETULT))) { // low part is known false, returns high part. // For LE / GE, if high part is known false, ignore the low part. // For LT / GT, if high part is known true, ignore the low part. NewLHS = Tmp2; NewRHS = SDValue(); return; } NewLHS = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi, ISD::SETEQ, false, DagCombineInfo, dl); if (!NewLHS.getNode()) NewLHS = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi, ISD::SETEQ); NewLHS = DAG.getSelect(dl, Tmp1.getValueType(), NewLHS, Tmp1, Tmp2); NewRHS = SDValue(); } SDValue DAGTypeLegalizer::ExpandIntOp_BR_CC(SDNode *N) { SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3); ISD::CondCode CCCode = cast(N->getOperand(1))->get(); IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N)); // If ExpandSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, NewLHS.getValueType()); CCCode = ISD::SETNE; } // Update N to have the operands specified. return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), DAG.getCondCode(CCCode), NewLHS, NewRHS, N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) { SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1); ISD::CondCode CCCode = cast(N->getOperand(4))->get(); IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N)); // If ExpandSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, NewLHS.getValueType()); CCCode = ISD::SETNE; } // Update N to have the operands specified. return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, N->getOperand(2), N->getOperand(3), DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) { SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1); ISD::CondCode CCCode = cast(N->getOperand(2))->get(); IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N)); // If ExpandSetCCOperands returned a scalar, use it. if (!NewRHS.getNode()) { assert(NewLHS.getValueType() == N->getValueType(0) && "Unexpected setcc expansion!"); return NewLHS; } // Otherwise, update N to have the operands specified. return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) { // The value being shifted is legal, but the shift amount is too big. // It follows that either the result of the shift is undefined, or the // upper half of the shift amount is zero. Just use the lower half. SDValue Lo, Hi; GetExpandedInteger(N->getOperand(1), Lo, Hi); return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Lo), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_RETURNADDR(SDNode *N) { // The argument of RETURNADDR / FRAMEADDR builtin is 32 bit contant. This // surely makes pretty nice problems on 8/16 bit targets. Just truncate this // constant to valid type. SDValue Lo, Hi; GetExpandedInteger(N->getOperand(0), Lo, Hi); return SDValue(DAG.UpdateNodeOperands(N, Lo), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) { SDValue Op = N->getOperand(0); EVT DstVT = N->getValueType(0); RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Don't know how to expand this SINT_TO_FP!"); return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, SDLoc(N)).first; } SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { if (ISD::isNormalStore(N)) return ExpandOp_NormalStore(N, OpNo); assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!"); assert(OpNo == 1 && "Can only expand the stored value so far"); EVT VT = N->getOperand(1).getValueType(); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); unsigned Alignment = N->getAlignment(); bool isVolatile = N->isVolatile(); bool isNonTemporal = N->isNonTemporal(); AAMDNodes AAInfo = N->getAAInfo(); SDLoc dl(N); SDValue Lo, Hi; assert(NVT.isByteSized() && "Expanded type not byte sized!"); if (N->getMemoryVT().bitsLE(NVT)) { GetExpandedInteger(N->getValue(), Lo, Hi); return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo(), N->getMemoryVT(), isVolatile, isNonTemporal, Alignment, AAInfo); } if (TLI.isLittleEndian()) { // Little-endian - low bits are at low addresses. GetExpandedInteger(N->getValue(), Lo, Hi); Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(), isVolatile, isNonTemporal, Alignment, AAInfo); unsigned ExcessBits = N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits(); EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits); // Increment the pointer to the other half. unsigned IncrementSize = NVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, Ptr.getValueType())); Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), NEVT, isVolatile, isNonTemporal, MinAlign(Alignment, IncrementSize), AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } // Big-endian - high bits are at low addresses. Favor aligned stores at // the cost of some bit-fiddling. GetExpandedInteger(N->getValue(), Lo, Hi); EVT ExtVT = N->getMemoryVT(); unsigned EBytes = ExtVT.getStoreSize(); unsigned IncrementSize = NVT.getSizeInBits()/8; unsigned ExcessBits = (EBytes - IncrementSize)*8; EVT HiVT = EVT::getIntegerVT(*DAG.getContext(), ExtVT.getSizeInBits() - ExcessBits); if (ExcessBits < NVT.getSizeInBits()) { // Transfer high bits from the top of Lo to the bottom of Hi. Hi = DAG.getNode(ISD::SHL, dl, NVT, Hi, DAG.getConstant(NVT.getSizeInBits() - ExcessBits, TLI.getPointerTy())); Hi = DAG.getNode(ISD::OR, dl, NVT, Hi, DAG.getNode(ISD::SRL, dl, NVT, Lo, DAG.getConstant(ExcessBits, TLI.getPointerTy()))); } // Store both the high bits and maybe some of the low bits. Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(), HiVT, isVolatile, isNonTemporal, Alignment, AAInfo); // Increment the pointer to the other half. Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, Ptr.getValueType())); // Store the lowest ExcessBits bits in the second half. Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), EVT::getIntegerVT(*DAG.getContext(), ExcessBits), isVolatile, isNonTemporal, MinAlign(Alignment, IncrementSize), AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } SDValue DAGTypeLegalizer::ExpandIntOp_TRUNCATE(SDNode *N) { SDValue InL, InH; GetExpandedInteger(N->getOperand(0), InL, InH); // Just truncate the low part of the source. return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), InL); } SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { SDValue Op = N->getOperand(0); EVT SrcVT = Op.getValueType(); EVT DstVT = N->getValueType(0); SDLoc dl(N); // The following optimization is valid only if every value in SrcVT (when // treated as signed) is representable in DstVT. Check that the mantissa // size of DstVT is >= than the number of bits in SrcVT -1. const fltSemantics &sem = DAG.EVTToAPFloatSemantics(DstVT); if (APFloat::semanticsPrecision(sem) >= SrcVT.getSizeInBits()-1 && TLI.getOperationAction(ISD::SINT_TO_FP, SrcVT) == TargetLowering::Custom){ // Do a signed conversion then adjust the result. SDValue SignedConv = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Op); SignedConv = TLI.LowerOperation(SignedConv, DAG); // The result of the signed conversion needs adjusting if the 'sign bit' of // the incoming integer was set. To handle this, we dynamically test to see // if it is set, and, if so, add a fudge factor. const uint64_t F32TwoE32 = 0x4F800000ULL; const uint64_t F32TwoE64 = 0x5F800000ULL; const uint64_t F32TwoE128 = 0x7F800000ULL; APInt FF(32, 0); if (SrcVT == MVT::i32) FF = APInt(32, F32TwoE32); else if (SrcVT == MVT::i64) FF = APInt(32, F32TwoE64); else if (SrcVT == MVT::i128) FF = APInt(32, F32TwoE128); else llvm_unreachable("Unsupported UINT_TO_FP!"); // Check whether the sign bit is set. SDValue Lo, Hi; GetExpandedInteger(Op, Lo, Hi); SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(Hi.getValueType()), Hi, DAG.getConstant(0, Hi.getValueType()), ISD::SETLT); // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. SDValue FudgePtr = DAG.getConstantPool( ConstantInt::get(*DAG.getContext(), FF.zext(64)), TLI.getPointerTy()); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0); SDValue Four = DAG.getIntPtrConstant(4); if (TLI.isBigEndian()) std::swap(Zero, Four); SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four); unsigned Alignment = cast(FudgePtr)->getAlignment(); FudgePtr = DAG.getNode(ISD::ADD, dl, FudgePtr.getValueType(), FudgePtr, Offset); Alignment = std::min(Alignment, 4u); // Load the value out, extending it from f32 to the destination float type. // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, DstVT, DAG.getEntryNode(), FudgePtr, MachinePointerInfo::getConstantPool(), MVT::f32, false, false, false, Alignment); return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge); } // Otherwise, use a libcall. RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Don't know how to expand this UINT_TO_FP!"); return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, dl).first; } SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) { SDLoc dl(N); SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, cast(N)->getMemoryVT(), N->getOperand(0), N->getOperand(1), N->getOperand(2), cast(N)->getMemOperand(), cast(N)->getOrdering(), cast(N)->getSynchScope()); return Swap.getValue(1); } SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) { SDValue InOp0 = N->getOperand(0); EVT InVT = InOp0.getValueType(); EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); unsigned OutNumElems = OutVT.getVectorNumElements(); EVT NOutVTElem = NOutVT.getVectorElementType(); SDLoc dl(N); SDValue BaseIdx = N->getOperand(1); SmallVector Ops; Ops.reserve(OutNumElems); for (unsigned i = 0; i != OutNumElems; ++i) { // Extract the element from the original vector. SDValue Index = DAG.getNode(ISD::ADD, dl, BaseIdx.getValueType(), BaseIdx, DAG.getConstant(i, BaseIdx.getValueType())); SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InVT.getVectorElementType(), N->getOperand(0), Index); SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, Ext); // Insert the converted element to the new vector. Ops.push_back(Op); } return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SHUFFLE(SDNode *N) { ShuffleVectorSDNode *SV = cast(N); EVT VT = N->getValueType(0); SDLoc dl(N); unsigned NumElts = VT.getVectorNumElements(); SmallVector NewMask; for (unsigned i = 0; i != NumElts; ++i) { NewMask.push_back(SV->getMaskElt(i)); } SDValue V0 = GetPromotedInteger(N->getOperand(0)); SDValue V1 = GetPromotedInteger(N->getOperand(1)); EVT OutVT = V0.getValueType(); return DAG.getVectorShuffle(OutVT, dl, V0, V1, &NewMask[0]); } SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) { EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); unsigned NumElems = N->getNumOperands(); EVT NOutVTElem = NOutVT.getVectorElementType(); SDLoc dl(N); SmallVector Ops; Ops.reserve(NumElems); for (unsigned i = 0; i != NumElems; ++i) { SDValue Op; // BUILD_VECTOR integer operand types are allowed to be larger than the // result's element type. This may still be true after the promotion. For // example, we might be promoting ( = BV , , ...) to // (v?i16 = BV , , ...), and we can't any_extend to . if (N->getOperand(i).getValueType().bitsLT(NOutVTElem)) Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(i)); else Op = N->getOperand(i); Ops.push_back(Op); } return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) { SDLoc dl(N); assert(!N->getOperand(0).getValueType().isVector() && "Input must be a scalar"); EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); EVT NOutVTElem = NOutVT.getVectorElementType(); SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(0)); return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NOutVT, Op); } SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) { SDLoc dl(N); EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); EVT InElemTy = OutVT.getVectorElementType(); EVT OutElemTy = NOutVT.getVectorElementType(); unsigned NumElem = N->getOperand(0).getValueType().getVectorNumElements(); unsigned NumOutElem = NOutVT.getVectorNumElements(); unsigned NumOperands = N->getNumOperands(); assert(NumElem * NumOperands == NumOutElem && "Unexpected number of elements"); // Take the elements from the first vector. SmallVector Ops(NumOutElem); for (unsigned i = 0; i < NumOperands; ++i) { SDValue Op = N->getOperand(i); for (unsigned j = 0; j < NumElem; ++j) { SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InElemTy, Op, DAG.getConstant(j, TLI.getVectorIdxTy())); Ops[i * NumElem + j] = DAG.getNode(ISD::ANY_EXTEND, dl, OutElemTy, Ext); } } return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) { EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); EVT NOutVTElem = NOutVT.getVectorElementType(); SDLoc dl(N); SDValue V0 = GetPromotedInteger(N->getOperand(0)); SDValue ConvElem = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(1)); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NOutVT, V0, ConvElem, N->getOperand(2)); } SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) { SDLoc dl(N); SDValue V0 = GetPromotedInteger(N->getOperand(0)); SDValue V1 = DAG.getZExtOrTrunc(N->getOperand(1), dl, TLI.getVectorIdxTy()); SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, V0->getValueType(0).getScalarType(), V0, V1); // EXTRACT_VECTOR_ELT can return types which are wider than the incoming // element types. If this is the case then we need to expand the outgoing // value and not truncate it. return DAG.getAnyExtOrTrunc(Ext, dl, N->getValueType(0)); } SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) { SDLoc dl(N); unsigned NumElems = N->getNumOperands(); EVT RetSclrTy = N->getValueType(0).getVectorElementType(); SmallVector NewOps; NewOps.reserve(NumElems); // For each incoming vector for (unsigned VecIdx = 0; VecIdx != NumElems; ++VecIdx) { SDValue Incoming = GetPromotedInteger(N->getOperand(VecIdx)); EVT SclrTy = Incoming->getValueType(0).getVectorElementType(); unsigned NumElem = Incoming->getValueType(0).getVectorNumElements(); for (unsigned i=0; igetValueType(0), NewOps); } Index: projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h =================================================================== --- projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h (revision 279024) +++ projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h (revision 279025) @@ -1,774 +1,775 @@ //===-- LegalizeTypes.h - DAG Type Legalizer class definition ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the DAGTypeLegalizer class. This is a private interface // shared between the code that implements the SelectionDAG::LegalizeTypes // method. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H #define LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetLowering.h" namespace llvm { //===----------------------------------------------------------------------===// /// DAGTypeLegalizer - This takes an arbitrary SelectionDAG as input and hacks /// on it until only value types the target machine can handle are left. This /// involves promoting small sizes to large sizes or splitting up large values /// into small values. /// class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { const TargetLowering &TLI; SelectionDAG &DAG; public: // NodeIdFlags - This pass uses the NodeId on the SDNodes to hold information // about the state of the node. The enum has all the values. enum NodeIdFlags { /// ReadyToProcess - All operands have been processed, so this node is ready /// to be handled. ReadyToProcess = 0, /// NewNode - This is a new node, not before seen, that was created in the /// process of legalizing some other node. NewNode = -1, /// Unanalyzed - This node's ID needs to be set to the number of its /// unprocessed operands. Unanalyzed = -2, /// Processed - This is a node that has already been processed. Processed = -3 // 1+ - This is a node which has this many unprocessed operands. }; private: /// ValueTypeActions - This is a bitvector that contains two bits for each /// simple value type, where the two bits correspond to the LegalizeAction /// enum from TargetLowering. This can be queried with "getTypeAction(VT)". TargetLowering::ValueTypeActionImpl ValueTypeActions; /// getTypeAction - Return how we should legalize values of this type. TargetLowering::LegalizeTypeAction getTypeAction(EVT VT) const { return TLI.getTypeAction(*DAG.getContext(), VT); } /// isTypeLegal - Return true if this type is legal on this target. bool isTypeLegal(EVT VT) const { return TLI.getTypeAction(*DAG.getContext(), VT) == TargetLowering::TypeLegal; } EVT getSetCCResultType(EVT VT) const { return TLI.getSetCCResultType(*DAG.getContext(), VT); } /// IgnoreNodeResults - Pretend all of this node's results are legal. bool IgnoreNodeResults(SDNode *N) const { return N->getOpcode() == ISD::TargetConstant; } /// PromotedIntegers - For integer nodes that are below legal width, this map /// indicates what promoted value to use. SmallDenseMap PromotedIntegers; /// ExpandedIntegers - For integer nodes that need to be expanded this map /// indicates which operands are the expanded version of the input. SmallDenseMap, 8> ExpandedIntegers; /// SoftenedFloats - For floating point nodes converted to integers of /// the same size, this map indicates the converted value to use. SmallDenseMap SoftenedFloats; /// ExpandedFloats - For float nodes that need to be expanded this map /// indicates which operands are the expanded version of the input. SmallDenseMap, 8> ExpandedFloats; /// ScalarizedVectors - For nodes that are <1 x ty>, this map indicates the /// scalar value of type 'ty' to use. SmallDenseMap ScalarizedVectors; /// SplitVectors - For nodes that need to be split this map indicates /// which operands are the expanded version of the input. SmallDenseMap, 8> SplitVectors; /// WidenedVectors - For vector nodes that need to be widened, indicates /// the widened value to use. SmallDenseMap WidenedVectors; /// ReplacedValues - For values that have been replaced with another, /// indicates the replacement value to use. SmallDenseMap ReplacedValues; /// Worklist - This defines a worklist of nodes to process. In order to be /// pushed onto this worklist, all operands of a node must have already been /// processed. SmallVector Worklist; public: explicit DAGTypeLegalizer(SelectionDAG &dag) : TLI(dag.getTargetLoweringInfo()), DAG(dag), ValueTypeActions(TLI.getValueTypeActions()) { static_assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE, "Too many value types for ValueTypeActions to hold!"); } /// run - This is the main entry point for the type legalizer. This does a /// top-down traversal of the dag, legalizing types as it goes. Returns /// "true" if it made any changes. bool run(); void NoteDeletion(SDNode *Old, SDNode *New) { ExpungeNode(Old); ExpungeNode(New); for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i) ReplacedValues[SDValue(Old, i)] = SDValue(New, i); } SelectionDAG &getDAG() const { return DAG; } private: SDNode *AnalyzeNewNode(SDNode *N); void AnalyzeNewValue(SDValue &Val); void ExpungeNode(SDNode *N); void PerformExpensiveChecks(); void RemapValue(SDValue &N); // Common routines. SDValue BitConvertToInteger(SDValue Op); SDValue BitConvertVectorToIntegerVector(SDValue Op); SDValue CreateStackStoreLoad(SDValue Op, EVT DestVT); bool CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult); bool CustomWidenLowerNode(SDNode *N, EVT VT); /// DisintegrateMERGE_VALUES - Replace each result of the given MERGE_VALUES /// node with the corresponding input operand, except for the result 'ResNo', /// for which the corresponding input operand is returned. SDValue DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo); SDValue GetVectorElementPointer(SDValue VecPtr, EVT EltVT, SDValue Index); SDValue JoinIntegers(SDValue Lo, SDValue Hi); SDValue LibCallify(RTLIB::Libcall LC, SDNode *N, bool isSigned); std::pair ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned); std::pair ExpandAtomic(SDNode *Node); SDValue PromoteTargetBoolean(SDValue Bool, EVT ValVT); void ReplaceValueWith(SDValue From, SDValue To); void SplitInteger(SDValue Op, SDValue &Lo, SDValue &Hi); void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT, SDValue &Lo, SDValue &Hi); //===--------------------------------------------------------------------===// // Integer Promotion Support: LegalizeIntegerTypes.cpp //===--------------------------------------------------------------------===// /// GetPromotedInteger - Given a processed operand Op which was promoted to a /// larger integer type, this returns the promoted value. The low bits of the /// promoted value corresponding to the original type are exactly equal to Op. /// The extra bits contain rubbish, so the promoted value may need to be zero- /// or sign-extended from the original type before it is usable (the helpers /// SExtPromotedInteger and ZExtPromotedInteger can do this for you). /// For example, if Op is an i16 and was promoted to an i32, then this method /// returns an i32, the lower 16 bits of which coincide with Op, and the upper /// 16 bits of which contain rubbish. SDValue GetPromotedInteger(SDValue Op) { SDValue &PromotedOp = PromotedIntegers[Op]; RemapValue(PromotedOp); assert(PromotedOp.getNode() && "Operand wasn't promoted?"); return PromotedOp; } void SetPromotedInteger(SDValue Op, SDValue Result); /// SExtPromotedInteger - Get a promoted operand and sign extend it to the /// final size. SDValue SExtPromotedInteger(SDValue Op) { EVT OldVT = Op.getValueType(); SDLoc dl(Op); Op = GetPromotedInteger(Op); return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(), Op, DAG.getValueType(OldVT)); } /// ZExtPromotedInteger - Get a promoted operand and zero extend it to the /// final size. SDValue ZExtPromotedInteger(SDValue Op) { EVT OldVT = Op.getValueType(); SDLoc dl(Op); Op = GetPromotedInteger(Op); return DAG.getZeroExtendInReg(Op, dl, OldVT.getScalarType()); } // Integer Result Promotion. void PromoteIntegerResult(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_AssertSext(SDNode *N); SDValue PromoteIntRes_AssertZext(SDNode *N); SDValue PromoteIntRes_Atomic0(AtomicSDNode *N); SDValue PromoteIntRes_Atomic1(AtomicSDNode *N); SDValue PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, unsigned ResNo); SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N); SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N); SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N); SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N); SDValue PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N); SDValue PromoteIntRes_CONCAT_VECTORS(SDNode *N); SDValue PromoteIntRes_BITCAST(SDNode *N); SDValue PromoteIntRes_BSWAP(SDNode *N); SDValue PromoteIntRes_BUILD_PAIR(SDNode *N); SDValue PromoteIntRes_Constant(SDNode *N); SDValue PromoteIntRes_CONVERT_RNDSAT(SDNode *N); SDValue PromoteIntRes_CTLZ(SDNode *N); SDValue PromoteIntRes_CTPOP(SDNode *N); SDValue PromoteIntRes_CTTZ(SDNode *N); SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N); SDValue PromoteIntRes_FP_TO_XINT(SDNode *N); SDValue PromoteIntRes_FP_TO_FP16(SDNode *N); SDValue PromoteIntRes_INT_EXTEND(SDNode *N); SDValue PromoteIntRes_LOAD(LoadSDNode *N); SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N); SDValue PromoteIntRes_Overflow(SDNode *N); SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_SDIV(SDNode *N); SDValue PromoteIntRes_SELECT(SDNode *N); SDValue PromoteIntRes_VSELECT(SDNode *N); SDValue PromoteIntRes_SELECT_CC(SDNode *N); SDValue PromoteIntRes_SETCC(SDNode *N); SDValue PromoteIntRes_SHL(SDNode *N); SDValue PromoteIntRes_SimpleIntBinOp(SDNode *N); SDValue PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N); SDValue PromoteIntRes_SRA(SDNode *N); SDValue PromoteIntRes_SRL(SDNode *N); SDValue PromoteIntRes_TRUNCATE(SDNode *N); SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_UDIV(SDNode *N); SDValue PromoteIntRes_UNDEF(SDNode *N); SDValue PromoteIntRes_VAARG(SDNode *N); SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo); // Integer Operand Promotion. bool PromoteIntegerOperand(SDNode *N, unsigned OperandNo); SDValue PromoteIntOp_ANY_EXTEND(SDNode *N); SDValue PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N); SDValue PromoteIntOp_BITCAST(SDNode *N); SDValue PromoteIntOp_BUILD_PAIR(SDNode *N); SDValue PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_BUILD_VECTOR(SDNode *N); SDValue PromoteIntOp_CONVERT_RNDSAT(SDNode *N); SDValue PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_EXTRACT_ELEMENT(SDNode *N); SDValue PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N); SDValue PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N); SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VSETCC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_Shift(SDNode *N); SDValue PromoteIntOp_SIGN_EXTEND(SDNode *N); SDValue PromoteIntOp_SINT_TO_FP(SDNode *N); SDValue PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo); SDValue PromoteIntOp_TRUNCATE(SDNode *N); SDValue PromoteIntOp_UINT_TO_FP(SDNode *N); SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N); SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); //===--------------------------------------------------------------------===// // Integer Expansion Support: LegalizeIntegerTypes.cpp //===--------------------------------------------------------------------===// /// GetExpandedInteger - Given a processed operand Op which was expanded into /// two integers of half the size, this returns the two halves. The low bits /// of Op are exactly equal to the bits of Lo; the high bits exactly equal Hi. /// For example, if Op is an i64 which was expanded into two i32's, then this /// method returns the two i32's, with Lo being equal to the lower 32 bits of /// Op, and Hi being equal to the upper 32 bits. void GetExpandedInteger(SDValue Op, SDValue &Lo, SDValue &Hi); void SetExpandedInteger(SDValue Op, SDValue Lo, SDValue Hi); // Integer Result Expansion. void ExpandIntegerResult(SDNode *N, unsigned ResNo); void ExpandIntRes_MERGE_VALUES (SDNode *N, unsigned ResNo, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ANY_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_AssertSext (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_AssertZext (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_Constant (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_CTLZ (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_CTPOP (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_CTTZ (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_LOAD (LoadSDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SIGN_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SIGN_EXTEND_INREG (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_TRUNCATE (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ZERO_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_FP_TO_SINT (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_FP_TO_UINT (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_Logical (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUB (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBC (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBE (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_BSWAP (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_MUL (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SDIV (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SREM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_UDIV (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_UREM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_Shift (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_UADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_XMULO (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ATOMIC_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandShiftByConstant(SDNode *N, unsigned Amt, SDValue &Lo, SDValue &Hi); bool ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi); bool ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi); // Integer Operand Expansion. bool ExpandIntegerOperand(SDNode *N, unsigned OperandNo); SDValue ExpandIntOp_BITCAST(SDNode *N); SDValue ExpandIntOp_BR_CC(SDNode *N); SDValue ExpandIntOp_BUILD_VECTOR(SDNode *N); SDValue ExpandIntOp_EXTRACT_ELEMENT(SDNode *N); SDValue ExpandIntOp_SELECT_CC(SDNode *N); SDValue ExpandIntOp_SETCC(SDNode *N); SDValue ExpandIntOp_Shift(SDNode *N); SDValue ExpandIntOp_SINT_TO_FP(SDNode *N); SDValue ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo); SDValue ExpandIntOp_TRUNCATE(SDNode *N); SDValue ExpandIntOp_UINT_TO_FP(SDNode *N); SDValue ExpandIntOp_RETURNADDR(SDNode *N); SDValue ExpandIntOp_ATOMIC_STORE(SDNode *N); void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, SDLoc dl); //===--------------------------------------------------------------------===// // Float to Integer Conversion Support: LegalizeFloatTypes.cpp //===--------------------------------------------------------------------===// /// GetSoftenedFloat - Given a processed operand Op which was converted to an /// integer of the same size, this returns the integer. The integer contains /// exactly the same bits as Op - only the type changed. For example, if Op /// is an f32 which was softened to an i32, then this method returns an i32, /// the bits of which coincide with those of Op. SDValue GetSoftenedFloat(SDValue Op) { SDValue &SoftenedOp = SoftenedFloats[Op]; RemapValue(SoftenedOp); assert(SoftenedOp.getNode() && "Operand wasn't converted to integer?"); return SoftenedOp; } void SetSoftenedFloat(SDValue Op, SDValue Result); // Result Float to Integer Conversion. void SoftenFloatResult(SDNode *N, unsigned OpNo); SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_BITCAST(SDNode *N); SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N); SDValue SoftenFloatRes_ConstantFP(ConstantFPSDNode *N); SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N); SDValue SoftenFloatRes_FABS(SDNode *N); SDValue SoftenFloatRes_FMINNUM(SDNode *N); SDValue SoftenFloatRes_FMAXNUM(SDNode *N); SDValue SoftenFloatRes_FADD(SDNode *N); SDValue SoftenFloatRes_FCEIL(SDNode *N); SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N); SDValue SoftenFloatRes_FCOS(SDNode *N); SDValue SoftenFloatRes_FDIV(SDNode *N); SDValue SoftenFloatRes_FEXP(SDNode *N); SDValue SoftenFloatRes_FEXP2(SDNode *N); SDValue SoftenFloatRes_FFLOOR(SDNode *N); SDValue SoftenFloatRes_FLOG(SDNode *N); SDValue SoftenFloatRes_FLOG2(SDNode *N); SDValue SoftenFloatRes_FLOG10(SDNode *N); SDValue SoftenFloatRes_FMA(SDNode *N); SDValue SoftenFloatRes_FMUL(SDNode *N); SDValue SoftenFloatRes_FNEARBYINT(SDNode *N); SDValue SoftenFloatRes_FNEG(SDNode *N); SDValue SoftenFloatRes_FP_EXTEND(SDNode *N); SDValue SoftenFloatRes_FP16_TO_FP(SDNode *N); SDValue SoftenFloatRes_FP_ROUND(SDNode *N); SDValue SoftenFloatRes_FPOW(SDNode *N); SDValue SoftenFloatRes_FPOWI(SDNode *N); SDValue SoftenFloatRes_FREM(SDNode *N); SDValue SoftenFloatRes_FRINT(SDNode *N); SDValue SoftenFloatRes_FROUND(SDNode *N); SDValue SoftenFloatRes_FSIN(SDNode *N); SDValue SoftenFloatRes_FSQRT(SDNode *N); SDValue SoftenFloatRes_FSUB(SDNode *N); SDValue SoftenFloatRes_FTRUNC(SDNode *N); SDValue SoftenFloatRes_LOAD(SDNode *N); SDValue SoftenFloatRes_SELECT(SDNode *N); SDValue SoftenFloatRes_SELECT_CC(SDNode *N); SDValue SoftenFloatRes_UNDEF(SDNode *N); SDValue SoftenFloatRes_VAARG(SDNode *N); SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N); // Operand Float to Integer Conversion. bool SoftenFloatOperand(SDNode *N, unsigned OpNo); SDValue SoftenFloatOp_BITCAST(SDNode *N); SDValue SoftenFloatOp_BR_CC(SDNode *N); SDValue SoftenFloatOp_FP_EXTEND(SDNode *N); SDValue SoftenFloatOp_FP_ROUND(SDNode *N); SDValue SoftenFloatOp_FP_TO_SINT(SDNode *N); SDValue SoftenFloatOp_FP_TO_UINT(SDNode *N); SDValue SoftenFloatOp_SELECT_CC(SDNode *N); SDValue SoftenFloatOp_SETCC(SDNode *N); SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo); //===--------------------------------------------------------------------===// // Float Expansion Support: LegalizeFloatTypes.cpp //===--------------------------------------------------------------------===// /// GetExpandedFloat - Given a processed operand Op which was expanded into /// two floating point values of half the size, this returns the two halves. /// The low bits of Op are exactly equal to the bits of Lo; the high bits /// exactly equal Hi. For example, if Op is a ppcf128 which was expanded /// into two f64's, then this method returns the two f64's, with Lo being /// equal to the lower 64 bits of Op, and Hi to the upper 64 bits. void GetExpandedFloat(SDValue Op, SDValue &Lo, SDValue &Hi); void SetExpandedFloat(SDValue Op, SDValue Lo, SDValue Hi); // Float Result Expansion. void ExpandFloatResult(SDNode *N, unsigned ResNo); void ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FABS (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMINNUM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMAXNUM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FADD (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FCEIL (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FCOPYSIGN (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FCOS (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FDIV (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FEXP (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FEXP2 (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FFLOOR (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FLOG (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FLOG2 (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FLOG10 (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMA (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMUL (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FNEARBYINT(SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FNEG (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FP_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FPOW (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FPOWI (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FREM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FRINT (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FROUND (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FSIN (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FSQRT (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FSUB (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FTRUNC (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, SDValue &Hi); // Float Operand Expansion. bool ExpandFloatOperand(SDNode *N, unsigned OperandNo); SDValue ExpandFloatOp_BR_CC(SDNode *N); SDValue ExpandFloatOp_FCOPYSIGN(SDNode *N); SDValue ExpandFloatOp_FP_ROUND(SDNode *N); SDValue ExpandFloatOp_FP_TO_SINT(SDNode *N); SDValue ExpandFloatOp_FP_TO_UINT(SDNode *N); SDValue ExpandFloatOp_SELECT_CC(SDNode *N); SDValue ExpandFloatOp_SETCC(SDNode *N); SDValue ExpandFloatOp_STORE(SDNode *N, unsigned OpNo); void FloatExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, SDLoc dl); //===--------------------------------------------------------------------===// // Scalarization Support: LegalizeVectorTypes.cpp //===--------------------------------------------------------------------===// /// GetScalarizedVector - Given a processed one-element vector Op which was /// scalarized to its element type, this returns the element. For example, /// if Op is a v1i32, Op = < i32 val >, this method returns val, an i32. SDValue GetScalarizedVector(SDValue Op) { SDValue &ScalarizedOp = ScalarizedVectors[Op]; RemapValue(ScalarizedOp); assert(ScalarizedOp.getNode() && "Operand wasn't scalarized?"); return ScalarizedOp; } void SetScalarizedVector(SDValue Op, SDValue Result); // Vector Result Scalarization: <1 x ty> -> ty. void ScalarizeVectorResult(SDNode *N, unsigned OpNo); SDValue ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo); SDValue ScalarizeVecRes_BinOp(SDNode *N); SDValue ScalarizeVecRes_TernaryOp(SDNode *N); SDValue ScalarizeVecRes_UnaryOp(SDNode *N); SDValue ScalarizeVecRes_InregOp(SDNode *N); SDValue ScalarizeVecRes_BITCAST(SDNode *N); SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N); SDValue ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N); SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N); SDValue ScalarizeVecRes_FP_ROUND(SDNode *N); SDValue ScalarizeVecRes_FPOWI(SDNode *N); SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N); SDValue ScalarizeVecRes_LOAD(LoadSDNode *N); SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N); SDValue ScalarizeVecRes_SIGN_EXTEND_INREG(SDNode *N); SDValue ScalarizeVecRes_VSELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT_CC(SDNode *N); SDValue ScalarizeVecRes_SETCC(SDNode *N); SDValue ScalarizeVecRes_UNDEF(SDNode *N); SDValue ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N); SDValue ScalarizeVecRes_VSETCC(SDNode *N); // Vector Operand Scalarization: <1 x ty> -> ty. bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo); SDValue ScalarizeVecOp_BITCAST(SDNode *N); SDValue ScalarizeVecOp_UnaryOp(SDNode *N); SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N); SDValue ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue ScalarizeVecOp_VSELECT(SDNode *N); SDValue ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo); SDValue ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo); //===--------------------------------------------------------------------===// // Vector Splitting Support: LegalizeVectorTypes.cpp //===--------------------------------------------------------------------===// /// GetSplitVector - Given a processed vector Op which was split into vectors /// of half the size, this method returns the halves. The first elements of /// Op coincide with the elements of Lo; the remaining elements of Op coincide /// with the elements of Hi: Op is what you would get by concatenating Lo and /// Hi. For example, if Op is a v8i32 that was split into two v4i32's, then /// this method returns the two v4i32's, with Lo corresponding to the first 4 /// elements of Op, and Hi to the last 4 elements. void GetSplitVector(SDValue Op, SDValue &Lo, SDValue &Hi); void SetSplitVector(SDValue Op, SDValue Lo, SDValue Hi); // Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>. void SplitVectorResult(SDNode *N, unsigned OpNo); void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BUILD_PAIR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_MLOAD(MaskedLoadSDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo, SDValue &Hi); // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>. bool SplitVectorOperand(SDNode *N, unsigned OpNo); SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo); SDValue SplitVecOp_UnaryOp(SDNode *N); SDValue SplitVecOp_BITCAST(SDNode *N); SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N); SDValue SplitVecOp_TRUNCATE(SDNode *N); SDValue SplitVecOp_VSETCC(SDNode *N); SDValue SplitVecOp_FP_ROUND(SDNode *N); //===--------------------------------------------------------------------===// // Vector Widening Support: LegalizeVectorTypes.cpp //===--------------------------------------------------------------------===// /// GetWidenedVector - Given a processed vector Op which was widened into a /// larger vector, this method returns the larger vector. The elements of /// the returned vector consist of the elements of Op followed by elements /// containing rubbish. For example, if Op is a v2i32 that was widened to a /// v4i32, then this method returns a v4i32 for which the first two elements /// are the same as those of Op, while the last two elements contain rubbish. SDValue GetWidenedVector(SDValue Op) { SDValue &WidenedOp = WidenedVectors[Op]; RemapValue(WidenedOp); assert(WidenedOp.getNode() && "Operand wasn't widened?"); return WidenedOp; } void SetWidenedVector(SDValue Op, SDValue Result); // Widen Vector Result Promotion. void WidenVectorResult(SDNode *N, unsigned ResNo); SDValue WidenVecRes_MERGE_VALUES(SDNode* N, unsigned ResNo); SDValue WidenVecRes_BITCAST(SDNode* N); SDValue WidenVecRes_BUILD_VECTOR(SDNode* N); SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N); SDValue WidenVecRes_CONVERT_RNDSAT(SDNode* N); SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N); SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N); SDValue WidenVecRes_LOAD(SDNode* N); SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N); SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N); SDValue WidenVecRes_SIGN_EXTEND_INREG(SDNode* N); SDValue WidenVecRes_SELECT(SDNode* N); SDValue WidenVecRes_SELECT_CC(SDNode* N); SDValue WidenVecRes_SETCC(SDNode* N); SDValue WidenVecRes_UNDEF(SDNode *N); SDValue WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N); SDValue WidenVecRes_VSETCC(SDNode* N); SDValue WidenVecRes_Ternary(SDNode *N); SDValue WidenVecRes_Binary(SDNode *N); SDValue WidenVecRes_BinaryCanTrap(SDNode *N); SDValue WidenVecRes_Convert(SDNode *N); SDValue WidenVecRes_POWI(SDNode *N); SDValue WidenVecRes_Shift(SDNode *N); SDValue WidenVecRes_Unary(SDNode *N); SDValue WidenVecRes_InregOp(SDNode *N); // Widen Vector Operand. bool WidenVectorOperand(SDNode *N, unsigned OpNo); SDValue WidenVecOp_BITCAST(SDNode *N); SDValue WidenVecOp_CONCAT_VECTORS(SDNode *N); SDValue WidenVecOp_EXTEND(SDNode *N); SDValue WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue WidenVecOp_STORE(SDNode* N); + SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo); SDValue WidenVecOp_SETCC(SDNode* N); SDValue WidenVecOp_Convert(SDNode *N); //===--------------------------------------------------------------------===// // Vector Widening Utilities Support: LegalizeVectorTypes.cpp //===--------------------------------------------------------------------===// /// Helper GenWidenVectorLoads - Helper function to generate a set of /// loads to load a vector with a resulting wider type. It takes /// LdChain: list of chains for the load to be generated. /// Ld: load to widen SDValue GenWidenVectorLoads(SmallVectorImpl &LdChain, LoadSDNode *LD); /// GenWidenVectorExtLoads - Helper function to generate a set of extension /// loads to load a ector with a resulting wider type. It takes /// LdChain: list of chains for the load to be generated. /// Ld: load to widen /// ExtType: extension element type SDValue GenWidenVectorExtLoads(SmallVectorImpl &LdChain, LoadSDNode *LD, ISD::LoadExtType ExtType); /// Helper genWidenVectorStores - Helper function to generate a set of /// stores to store a widen vector into non-widen memory /// StChain: list of chains for the stores we have generated /// ST: store of a widen value void GenWidenVectorStores(SmallVectorImpl &StChain, StoreSDNode *ST); /// Helper genWidenVectorTruncStores - Helper function to generate a set of /// stores to store a truncate widen vector into non-widen memory /// StChain: list of chains for the stores we have generated /// ST: store of a widen value void GenWidenVectorTruncStores(SmallVectorImpl &StChain, StoreSDNode *ST); /// Modifies a vector input (widen or narrows) to a vector of NVT. The /// input vector must have the same element type as NVT. SDValue ModifyToType(SDValue InOp, EVT WidenVT); //===--------------------------------------------------------------------===// // Generic Splitting: LegalizeTypesGeneric.cpp //===--------------------------------------------------------------------===// // Legalization methods which only use that the illegal type is split into two // not necessarily identical types. As such they can be used for splitting // vectors and expanding integers and floats. void GetSplitOp(SDValue Op, SDValue &Lo, SDValue &Hi) { if (Op.getValueType().isVector()) GetSplitVector(Op, Lo, Hi); else if (Op.getValueType().isInteger()) GetExpandedInteger(Op, Lo, Hi); else GetExpandedFloat(Op, Lo, Hi); } /// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and /// high parts of the given value. void GetPairElements(SDValue Pair, SDValue &Lo, SDValue &Hi); // Generic Result Splitting. void SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo, SDValue &Lo, SDValue &Hi); void SplitRes_SELECT (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi); //===--------------------------------------------------------------------===// // Generic Expansion: LegalizeTypesGeneric.cpp //===--------------------------------------------------------------------===// // Legalization methods which only use that the illegal type is split into two // identical types of half the size, and that the Lo/Hi part is stored first // in memory on little/big-endian machines, followed by the Hi/Lo part. As // such they can be used for expanding integers and floats. void GetExpandedOp(SDValue Op, SDValue &Lo, SDValue &Hi) { if (Op.getValueType().isInteger()) GetExpandedInteger(Op, Lo, Hi); else GetExpandedFloat(Op, Lo, Hi); } /// This function will split the integer \p Op into \p NumElements /// operations of type \p EltVT and store them in \p Ops. void IntegerToVector(SDValue Op, unsigned NumElements, SmallVectorImpl &Ops, EVT EltVT); // Generic Result Expansion. void ExpandRes_MERGE_VALUES (SDNode *N, unsigned ResNo, SDValue &Lo, SDValue &Hi); void ExpandRes_BITCAST (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandRes_BUILD_PAIR (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandRes_EXTRACT_ELEMENT (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandRes_NormalLoad (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandRes_VAARG (SDNode *N, SDValue &Lo, SDValue &Hi); // Generic Operand Expansion. SDValue ExpandOp_BITCAST (SDNode *N); SDValue ExpandOp_BUILD_VECTOR (SDNode *N); SDValue ExpandOp_EXTRACT_ELEMENT (SDNode *N); SDValue ExpandOp_INSERT_VECTOR_ELT(SDNode *N); SDValue ExpandOp_SCALAR_TO_VECTOR (SDNode *N); SDValue ExpandOp_NormalStore (SDNode *N, unsigned OpNo); }; } // end namespace llvm. #endif Index: projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (revision 279025) @@ -1,3286 +1,3324 @@ //===------- LegalizeVectorTypes.cpp - Legalization of vector types -------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file performs vector type splitting and scalarization for LegalizeTypes. // Scalarization is the act of changing a computation in an illegal one-element // vector type to be a computation in its scalar element type. For example, // implementing <1 x f32> arithmetic in a scalar f32 register. This is needed // as a base case when scalarizing vector arithmetic like <4 x f32>, which // eventually decomposes to scalars if the target doesn't support v4f32 or v2f32 // types. // Splitting is the act of changing a computation in an invalid vector type to // be a computation in two vectors of half the size. For example, implementing // <128 x f32> operations in terms of two <64 x f32> operations. // //===----------------------------------------------------------------------===// #include "LegalizeTypes.h" #include "llvm/IR/DataLayout.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "legalize-types" //===----------------------------------------------------------------------===// // Result Vector Scalarization: <1 x ty> -> ty. //===----------------------------------------------------------------------===// void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { DEBUG(dbgs() << "Scalarize node result " << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"); SDValue R = SDValue(); switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "ScalarizeVectorResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif report_fatal_error("Do not know how to scalarize the result of this " "operator!\n"); case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break; case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break; case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break; case ISD::CONVERT_RNDSAT: R = ScalarizeVecRes_CONVERT_RNDSAT(N); break; case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break; case ISD::FP_ROUND_INREG: R = ScalarizeVecRes_InregOp(N); break; case ISD::FPOWI: R = ScalarizeVecRes_FPOWI(N); break; case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break; case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast(N));break; case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break; case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break; case ISD::VSELECT: R = ScalarizeVecRes_VSELECT(N); break; case ISD::SELECT: R = ScalarizeVecRes_SELECT(N); break; case ISD::SELECT_CC: R = ScalarizeVecRes_SELECT_CC(N); break; case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break; case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break; case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break; case ISD::ANY_EXTEND: case ISD::BSWAP: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: case ISD::CTPOP: case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: case ISD::FABS: case ISD::FCEIL: case ISD::FCOS: case ISD::FEXP: case ISD::FEXP2: case ISD::FFLOOR: case ISD::FLOG: case ISD::FLOG10: case ISD::FLOG2: case ISD::FNEARBYINT: case ISD::FNEG: case ISD::FP_EXTEND: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::FRINT: case ISD::FROUND: case ISD::FSIN: case ISD::FSQRT: case ISD::FTRUNC: case ISD::SIGN_EXTEND: case ISD::SINT_TO_FP: case ISD::TRUNCATE: case ISD::UINT_TO_FP: case ISD::ZERO_EXTEND: R = ScalarizeVecRes_UnaryOp(N); break; case ISD::ADD: case ISD::AND: case ISD::FADD: case ISD::FCOPYSIGN: case ISD::FDIV: case ISD::FMUL: case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FPOW: case ISD::FREM: case ISD::FSUB: case ISD::MUL: case ISD::OR: case ISD::SDIV: case ISD::SREM: case ISD::SUB: case ISD::UDIV: case ISD::UREM: case ISD::XOR: case ISD::SHL: case ISD::SRA: case ISD::SRL: R = ScalarizeVecRes_BinOp(N); break; case ISD::FMA: R = ScalarizeVecRes_TernaryOp(N); break; } // If R is null, the sub-method took care of registering the result. if (R.getNode()) SetScalarizedVector(SDValue(N, ResNo), R); } SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) { SDValue LHS = GetScalarizedVector(N->getOperand(0)); SDValue RHS = GetScalarizedVector(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) { SDValue Op0 = GetScalarizedVector(N->getOperand(0)); SDValue Op1 = GetScalarizedVector(N->getOperand(1)); SDValue Op2 = GetScalarizedVector(N->getOperand(2)); return DAG.getNode(N->getOpcode(), SDLoc(N), Op0.getValueType(), Op0, Op1, Op2); } SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) { SDValue Op = DisintegrateMERGE_VALUES(N, ResNo); return GetScalarizedVector(Op); } SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) { EVT NewVT = N->getValueType(0).getVectorElementType(); return DAG.getNode(ISD::BITCAST, SDLoc(N), NewVT, N->getOperand(0)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) { EVT EltVT = N->getValueType(0).getVectorElementType(); SDValue InOp = N->getOperand(0); // The BUILD_VECTOR operands may be of wider element types and // we may need to truncate them back to the requested return type. if (EltVT.isInteger()) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp); return InOp; } SDValue DAGTypeLegalizer::ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N) { EVT NewVT = N->getValueType(0).getVectorElementType(); SDValue Op0 = GetScalarizedVector(N->getOperand(0)); return DAG.getConvertRndSat(NewVT, SDLoc(N), Op0, DAG.getValueType(NewVT), DAG.getValueType(Op0.getValueType()), N->getOperand(3), N->getOperand(4), cast(N)->getCvtCode()); } SDValue DAGTypeLegalizer::ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N) { return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), N->getValueType(0).getVectorElementType(), N->getOperand(0), N->getOperand(1)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_FP_ROUND(SDNode *N) { EVT NewVT = N->getValueType(0).getVectorElementType(); SDValue Op = GetScalarizedVector(N->getOperand(0)); return DAG.getNode(ISD::FP_ROUND, SDLoc(N), NewVT, Op, N->getOperand(1)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_FPOWI(SDNode *N) { SDValue Op = GetScalarizedVector(N->getOperand(0)); return DAG.getNode(ISD::FPOWI, SDLoc(N), Op.getValueType(), Op, N->getOperand(1)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) { // The value to insert may have a wider type than the vector element type, // so be sure to truncate it to the element type if necessary. SDValue Op = N->getOperand(1); EVT EltVT = N->getValueType(0).getVectorElementType(); if (Op.getValueType() != EltVT) // FIXME: Can this happen for floating point types? Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, Op); return Op; } SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) { assert(N->isUnindexed() && "Indexed vector load?"); SDValue Result = DAG.getLoad(ISD::UNINDEXED, N->getExtensionType(), N->getValueType(0).getVectorElementType(), SDLoc(N), N->getChain(), N->getBasePtr(), DAG.getUNDEF(N->getBasePtr().getValueType()), N->getPointerInfo(), N->getMemoryVT().getVectorElementType(), N->isVolatile(), N->isNonTemporal(), N->isInvariant(), N->getOriginalAlignment(), N->getAAInfo()); // Legalized the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Result.getValue(1)); return Result; } SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) { // Get the dest type - it doesn't always match the input type, e.g. int_to_fp. EVT DestVT = N->getValueType(0).getVectorElementType(); SDValue Op = N->getOperand(0); EVT OpVT = Op.getValueType(); SDLoc DL(N); // The result needs scalarizing, but it's not a given that the source does. // This is a workaround for targets where it's impossible to scalarize the // result of a conversion, because the source type is legal. // For instance, this happens on AArch64: v1i1 is illegal but v1i{8,16,32} // are widened to v8i8, v4i16, and v2i32, which is legal, because v1i64 is // legal and was not scalarized. // See the similar logic in ScalarizeVecRes_VSETCC if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) { Op = GetScalarizedVector(Op); } else { EVT VT = OpVT.getVectorElementType(); Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, DAG.getConstant(0, TLI.getVectorIdxTy())); } return DAG.getNode(N->getOpcode(), SDLoc(N), DestVT, Op); } SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) { EVT EltVT = N->getValueType(0).getVectorElementType(); EVT ExtVT = cast(N->getOperand(1))->getVT().getVectorElementType(); SDValue LHS = GetScalarizedVector(N->getOperand(0)); return DAG.getNode(N->getOpcode(), SDLoc(N), EltVT, LHS, DAG.getValueType(ExtVT)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) { // If the operand is wider than the vector element type then it is implicitly // truncated. Make that explicit here. EVT EltVT = N->getValueType(0).getVectorElementType(); SDValue InOp = N->getOperand(0); if (InOp.getValueType() != EltVT) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp); return InOp; } SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) { SDValue Cond = GetScalarizedVector(N->getOperand(0)); SDValue LHS = GetScalarizedVector(N->getOperand(1)); TargetLowering::BooleanContent ScalarBool = TLI.getBooleanContents(false, false); TargetLowering::BooleanContent VecBool = TLI.getBooleanContents(true, false); // If integer and float booleans have different contents then we can't // reliably optimize in all cases. There is a full explanation for this in // DAGCombiner::visitSELECT() where the same issue affects folding // (select C, 0, 1) to (xor C, 1). if (TLI.getBooleanContents(false, false) != TLI.getBooleanContents(false, true)) { // At least try the common case where the boolean is generated by a // comparison. if (Cond->getOpcode() == ISD::SETCC) { EVT OpVT = Cond->getOperand(0)->getValueType(0); ScalarBool = TLI.getBooleanContents(OpVT.getScalarType()); VecBool = TLI.getBooleanContents(OpVT); } else ScalarBool = TargetLowering::UndefinedBooleanContent; } if (ScalarBool != VecBool) { EVT CondVT = Cond.getValueType(); switch (ScalarBool) { case TargetLowering::UndefinedBooleanContent: break; case TargetLowering::ZeroOrOneBooleanContent: assert(VecBool == TargetLowering::UndefinedBooleanContent || VecBool == TargetLowering::ZeroOrNegativeOneBooleanContent); // Vector read from all ones, scalar expects a single 1 so mask. Cond = DAG.getNode(ISD::AND, SDLoc(N), CondVT, Cond, DAG.getConstant(1, CondVT)); break; case TargetLowering::ZeroOrNegativeOneBooleanContent: assert(VecBool == TargetLowering::UndefinedBooleanContent || VecBool == TargetLowering::ZeroOrOneBooleanContent); // Vector reads from a one, scalar from all ones so sign extend. Cond = DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), CondVT, Cond, DAG.getValueType(MVT::i1)); break; } } return DAG.getSelect(SDLoc(N), LHS.getValueType(), Cond, LHS, GetScalarizedVector(N->getOperand(2))); } SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) { SDValue LHS = GetScalarizedVector(N->getOperand(1)); return DAG.getSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, GetScalarizedVector(N->getOperand(2))); } SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) { SDValue LHS = GetScalarizedVector(N->getOperand(2)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(), N->getOperand(0), N->getOperand(1), LHS, GetScalarizedVector(N->getOperand(3)), N->getOperand(4)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_SETCC(SDNode *N) { assert(N->getValueType(0).isVector() == N->getOperand(0).getValueType().isVector() && "Scalar/Vector type mismatch"); if (N->getValueType(0).isVector()) return ScalarizeVecRes_VSETCC(N); SDValue LHS = GetScalarizedVector(N->getOperand(0)); SDValue RHS = GetScalarizedVector(N->getOperand(1)); SDLoc DL(N); // Turn it into a scalar SETCC. return DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, N->getOperand(2)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_UNDEF(SDNode *N) { return DAG.getUNDEF(N->getValueType(0).getVectorElementType()); } SDValue DAGTypeLegalizer::ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N) { // Figure out if the scalar is the LHS or RHS and return it. SDValue Arg = N->getOperand(2).getOperand(0); if (Arg.getOpcode() == ISD::UNDEF) return DAG.getUNDEF(N->getValueType(0).getVectorElementType()); unsigned Op = !cast(Arg)->isNullValue(); return GetScalarizedVector(N->getOperand(Op)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_VSETCC(SDNode *N) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && "Operand types must be vectors"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); EVT OpVT = LHS.getValueType(); EVT NVT = N->getValueType(0).getVectorElementType(); SDLoc DL(N); // The result needs scalarizing, but it's not a given that the source does. if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) { LHS = GetScalarizedVector(LHS); RHS = GetScalarizedVector(RHS); } else { EVT VT = OpVT.getVectorElementType(); LHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, LHS, DAG.getConstant(0, TLI.getVectorIdxTy())); RHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, RHS, DAG.getConstant(0, TLI.getVectorIdxTy())); } // Turn it into a scalar SETCC. SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, N->getOperand(2)); // Vectors may have a different boolean contents to scalars. Promote the // value appropriately. ISD::NodeType ExtendCode = TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT)); return DAG.getNode(ExtendCode, DL, NVT, Res); } //===----------------------------------------------------------------------===// // Operand Vector Scalarization <1 x ty> -> ty. //===----------------------------------------------------------------------===// bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) { DEBUG(dbgs() << "Scalarize node operand " << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); if (!Res.getNode()) { switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "ScalarizeVectorOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to scalarize this operator's operand!"); case ISD::BITCAST: Res = ScalarizeVecOp_BITCAST(N); break; case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: case ISD::TRUNCATE: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: Res = ScalarizeVecOp_UnaryOp(N); break; case ISD::CONCAT_VECTORS: Res = ScalarizeVecOp_CONCAT_VECTORS(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = ScalarizeVecOp_EXTRACT_VECTOR_ELT(N); break; case ISD::VSELECT: Res = ScalarizeVecOp_VSELECT(N); break; case ISD::STORE: Res = ScalarizeVecOp_STORE(cast(N), OpNo); break; case ISD::FP_ROUND: Res = ScalarizeVecOp_FP_ROUND(N, OpNo); break; } } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer // core about this. if (Res.getNode() == N) return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && "Invalid operand expansion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } /// ScalarizeVecOp_BITCAST - If the value to convert is a vector that needs /// to be scalarized, it must be <1 x ty>. Convert the element instead. SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) { SDValue Elt = GetScalarizedVector(N->getOperand(0)); return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Elt); } /// ScalarizeVecOp_UnaryOp - If the input is a vector that needs to be /// scalarized, it must be <1 x ty>. Do the operation on the element instead. SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) { assert(N->getValueType(0).getVectorNumElements() == 1 && "Unexpected vector type!"); SDValue Elt = GetScalarizedVector(N->getOperand(0)); SDValue Op = DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0).getScalarType(), Elt); // Revectorize the result so the types line up with what the uses of this // expression expect. return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Op); } /// ScalarizeVecOp_CONCAT_VECTORS - The vectors to concatenate have length one - /// use a BUILD_VECTOR instead. SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) { SmallVector Ops(N->getNumOperands()); for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) Ops[i] = GetScalarizedVector(N->getOperand(i)); return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Ops); } /// ScalarizeVecOp_EXTRACT_VECTOR_ELT - If the input is a vector that needs to /// be scalarized, it must be <1 x ty>, so just return the element, ignoring the /// index. SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { SDValue Res = GetScalarizedVector(N->getOperand(0)); if (Res.getValueType() != N->getValueType(0)) Res = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Res); return Res; } /// ScalarizeVecOp_VSELECT - If the input condition is a vector that needs to be /// scalarized, it must be <1 x i1>, so just convert to a normal ISD::SELECT /// (still with vector output type since that was acceptable if we got here). SDValue DAGTypeLegalizer::ScalarizeVecOp_VSELECT(SDNode *N) { SDValue ScalarCond = GetScalarizedVector(N->getOperand(0)); EVT VT = N->getValueType(0); return DAG.getNode(ISD::SELECT, SDLoc(N), VT, ScalarCond, N->getOperand(1), N->getOperand(2)); } /// ScalarizeVecOp_STORE - If the value to store is a vector that needs to be /// scalarized, it must be <1 x ty>. Just store the element. SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){ assert(N->isUnindexed() && "Indexed store of one-element vector?"); assert(OpNo == 1 && "Do not know how to scalarize this operand!"); SDLoc dl(N); if (N->isTruncatingStore()) return DAG.getTruncStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)), N->getBasePtr(), N->getPointerInfo(), N->getMemoryVT().getVectorElementType(), N->isVolatile(), N->isNonTemporal(), N->getAlignment(), N->getAAInfo()); return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)), N->getBasePtr(), N->getPointerInfo(), N->isVolatile(), N->isNonTemporal(), N->getOriginalAlignment(), N->getAAInfo()); } /// ScalarizeVecOp_FP_ROUND - If the value to round is a vector that needs /// to be scalarized, it must be <1 x ty>. Convert the element instead. SDValue DAGTypeLegalizer::ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo) { SDValue Elt = GetScalarizedVector(N->getOperand(0)); SDValue Res = DAG.getNode(ISD::FP_ROUND, SDLoc(N), N->getValueType(0).getVectorElementType(), Elt, N->getOperand(1)); return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res); } //===----------------------------------------------------------------------===// // Result Vector Splitting //===----------------------------------------------------------------------===// /// SplitVectorResult - This method is called when the specified result of the /// specified node is found to need vector splitting. At this point, the node /// may also have invalid operands or may have other results that need /// legalization, we just know that (at least) one result needs vector /// splitting. void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { DEBUG(dbgs() << "Split node result: "; N->dump(&DAG); dbgs() << "\n"); SDValue Lo, Hi; // See if the target wants to custom expand this node. if (CustomLowerNode(N, N->getValueType(ResNo), true)) return; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "SplitVectorResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif report_fatal_error("Do not know how to split the result of this " "operator!\n"); case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::VSELECT: case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::BITCAST: SplitVecRes_BITCAST(N, Lo, Hi); break; case ISD::BUILD_VECTOR: SplitVecRes_BUILD_VECTOR(N, Lo, Hi); break; case ISD::CONCAT_VECTORS: SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break; case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break; case ISD::INSERT_SUBVECTOR: SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break; case ISD::FP_ROUND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break; case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break; case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break; case ISD::SCALAR_TO_VECTOR: SplitVecRes_SCALAR_TO_VECTOR(N, Lo, Hi); break; case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break; case ISD::LOAD: SplitVecRes_LOAD(cast(N), Lo, Hi); break; case ISD::MLOAD: SplitVecRes_MLOAD(cast(N), Lo, Hi); break; case ISD::SETCC: SplitVecRes_SETCC(N, Lo, Hi); break; case ISD::VECTOR_SHUFFLE: SplitVecRes_VECTOR_SHUFFLE(cast(N), Lo, Hi); break; case ISD::BSWAP: case ISD::CONVERT_RNDSAT: case ISD::CTLZ: case ISD::CTTZ: case ISD::CTLZ_ZERO_UNDEF: case ISD::CTTZ_ZERO_UNDEF: case ISD::CTPOP: case ISD::FABS: case ISD::FCEIL: case ISD::FCOS: case ISD::FEXP: case ISD::FEXP2: case ISD::FFLOOR: case ISD::FLOG: case ISD::FLOG10: case ISD::FLOG2: case ISD::FNEARBYINT: case ISD::FNEG: case ISD::FP_EXTEND: case ISD::FP_ROUND: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::FRINT: case ISD::FROUND: case ISD::FSIN: case ISD::FSQRT: case ISD::FTRUNC: case ISD::SINT_TO_FP: case ISD::TRUNCATE: case ISD::UINT_TO_FP: SplitVecRes_UnaryOp(N, Lo, Hi); break; case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: SplitVecRes_ExtendOp(N, Lo, Hi); break; case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::FADD: case ISD::FCOPYSIGN: case ISD::FSUB: case ISD::FMUL: case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::SDIV: case ISD::UDIV: case ISD::FDIV: case ISD::FPOW: case ISD::AND: case ISD::OR: case ISD::XOR: case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::UREM: case ISD::SREM: case ISD::FREM: SplitVecRes_BinOp(N, Lo, Hi); break; case ISD::FMA: SplitVecRes_TernaryOp(N, Lo, Hi); break; } // If Lo/Hi is null, the sub-method took care of registering results etc. if (Lo.getNode()) SetSplitVector(SDValue(N, ResNo), Lo, Hi); } void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LHSLo, LHSHi; GetSplitVector(N->getOperand(0), LHSLo, LHSHi); SDValue RHSLo, RHSHi; GetSplitVector(N->getOperand(1), RHSLo, RHSHi); SDLoc dl(N); Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo, RHSLo); Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi, RHSHi); } void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Op0Lo, Op0Hi; GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi); SDValue Op1Lo, Op1Hi; GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi); SDValue Op2Lo, Op2Hi; GetSplitVector(N->getOperand(2), Op2Lo, Op2Hi); SDLoc dl(N); Lo = DAG.getNode(N->getOpcode(), dl, Op0Lo.getValueType(), Op0Lo, Op1Lo, Op2Lo); Hi = DAG.getNode(N->getOpcode(), dl, Op0Hi.getValueType(), Op0Hi, Op1Hi, Op2Hi); } void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { // We know the result is a vector. The input may be either a vector or a // scalar value. EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); SDLoc dl(N); SDValue InOp = N->getOperand(0); EVT InVT = InOp.getValueType(); // Handle some special cases efficiently. switch (getTypeAction(InVT)) { case TargetLowering::TypeLegal: case TargetLowering::TypePromoteInteger: case TargetLowering::TypeSoftenFloat: case TargetLowering::TypeScalarizeVector: case TargetLowering::TypeWidenVector: break; case TargetLowering::TypeExpandInteger: case TargetLowering::TypeExpandFloat: // A scalar to vector conversion, where the scalar needs expansion. // If the vector is being split in two then we can just convert the // expanded pieces. if (LoVT == HiVT) { GetExpandedOp(InOp, Lo, Hi); if (TLI.isBigEndian()) std::swap(Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); return; } break; case TargetLowering::TypeSplitVector: // If the input is a vector that needs to be split, convert each split // piece of the input now. GetSplitVector(InOp, Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); return; } // In the general case, convert the input to an integer and split it by hand. EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits()); EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits()); if (TLI.isBigEndian()) std::swap(LoIntVT, HiIntVT); SplitInteger(BitConvertToInteger(InOp), LoIntVT, HiIntVT, Lo, Hi); if (TLI.isBigEndian()) std::swap(Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); } void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT LoVT, HiVT; SDLoc dl(N); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); unsigned LoNumElts = LoVT.getVectorNumElements(); SmallVector LoOps(N->op_begin(), N->op_begin()+LoNumElts); Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, LoOps); SmallVector HiOps(N->op_begin()+LoNumElts, N->op_end()); Hi = DAG.getNode(ISD::BUILD_VECTOR, dl, HiVT, HiOps); } void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(!(N->getNumOperands() & 1) && "Unsupported CONCAT_VECTORS"); SDLoc dl(N); unsigned NumSubvectors = N->getNumOperands() / 2; if (NumSubvectors == 1) { Lo = N->getOperand(0); Hi = N->getOperand(1); return; } EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); SmallVector LoOps(N->op_begin(), N->op_begin()+NumSubvectors); Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, LoOps); SmallVector HiOps(N->op_begin()+NumSubvectors, N->op_end()); Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HiVT, HiOps); } void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Vec = N->getOperand(0); SDValue Idx = N->getOperand(1); SDLoc dl(N); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, LoVT, Vec, Idx); uint64_t IdxVal = cast(Idx)->getZExtValue(); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec, DAG.getConstant(IdxVal + LoVT.getVectorNumElements(), TLI.getVectorIdxTy())); } void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Vec = N->getOperand(0); SDValue SubVec = N->getOperand(1); SDValue Idx = N->getOperand(2); SDLoc dl(N); GetSplitVector(Vec, Lo, Hi); // Spill the vector to the stack. EVT VecVT = Vec.getValueType(); EVT SubVecVT = VecVT.getVectorElementType(); SDValue StackPtr = DAG.CreateStackTemporary(VecVT); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo(), false, false, 0); // Store the new subvector into the specified index. SDValue SubVecPtr = GetVectorElementPointer(StackPtr, SubVecVT, Idx); Type *VecType = VecVT.getTypeForEVT(*DAG.getContext()); unsigned Alignment = TLI.getDataLayout()->getPrefTypeAlignment(VecType); Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, MachinePointerInfo(), false, false, 0); // Load the Lo part from the stack slot. Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo(), false, false, false, 0); // Increment the pointer to the other part. unsigned IncrementSize = Lo.getValueType().getSizeInBits() / 8; StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr, DAG.getConstant(IncrementSize, StackPtr.getValueType())); // Load the Hi part from the stack slot. Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(), false, false, false, MinAlign(Alignment, IncrementSize)); } void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetSplitVector(N->getOperand(0), Lo, Hi); Lo = DAG.getNode(ISD::FPOWI, dl, Lo.getValueType(), Lo, N->getOperand(1)); Hi = DAG.getNode(ISD::FPOWI, dl, Hi.getValueType(), Hi, N->getOperand(1)); } void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LHSLo, LHSHi; GetSplitVector(N->getOperand(0), LHSLo, LHSHi); SDLoc dl(N); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(cast(N->getOperand(1))->getVT()); Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo, DAG.getValueType(LoVT)); Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi, DAG.getValueType(HiVT)); } void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Vec = N->getOperand(0); SDValue Elt = N->getOperand(1); SDValue Idx = N->getOperand(2); SDLoc dl(N); GetSplitVector(Vec, Lo, Hi); if (ConstantSDNode *CIdx = dyn_cast(Idx)) { unsigned IdxVal = CIdx->getZExtValue(); unsigned LoNumElts = Lo.getValueType().getVectorNumElements(); if (IdxVal < LoNumElts) Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Lo.getValueType(), Lo, Elt, Idx); else Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt, DAG.getConstant(IdxVal - LoNumElts, TLI.getVectorIdxTy())); return; } // See if the target wants to custom expand this node. if (CustomLowerNode(N, N->getValueType(0), true)) return; // Spill the vector to the stack. EVT VecVT = Vec.getValueType(); EVT EltVT = VecVT.getVectorElementType(); SDValue StackPtr = DAG.CreateStackTemporary(VecVT); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo(), false, false, 0); // Store the new element. This may be larger than the vector element type, // so use a truncating store. SDValue EltPtr = GetVectorElementPointer(StackPtr, EltVT, Idx); Type *VecType = VecVT.getTypeForEVT(*DAG.getContext()); unsigned Alignment = TLI.getDataLayout()->getPrefTypeAlignment(VecType); Store = DAG.getTruncStore(Store, dl, Elt, EltPtr, MachinePointerInfo(), EltVT, false, false, 0); // Load the Lo part from the stack slot. Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo(), false, false, false, 0); // Increment the pointer to the other part. unsigned IncrementSize = Lo.getValueType().getSizeInBits() / 8; StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr, DAG.getConstant(IncrementSize, StackPtr.getValueType())); // Load the Hi part from the stack slot. Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(), false, false, false, MinAlign(Alignment, IncrementSize)); } void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT LoVT, HiVT; SDLoc dl(N); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); Lo = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoVT, N->getOperand(0)); Hi = DAG.getUNDEF(HiVT); } void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi) { assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!"); EVT LoVT, HiVT; SDLoc dl(LD); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0)); ISD::LoadExtType ExtType = LD->getExtensionType(); SDValue Ch = LD->getChain(); SDValue Ptr = LD->getBasePtr(); SDValue Offset = DAG.getUNDEF(Ptr.getValueType()); EVT MemoryVT = LD->getMemoryVT(); unsigned Alignment = LD->getOriginalAlignment(); bool isVolatile = LD->isVolatile(); bool isNonTemporal = LD->isNonTemporal(); bool isInvariant = LD->isInvariant(); AAMDNodes AAInfo = LD->getAAInfo(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset, LD->getPointerInfo(), LoMemVT, isVolatile, isNonTemporal, isInvariant, Alignment, AAInfo); unsigned IncrementSize = LoMemVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, Ptr.getValueType())); Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset, LD->getPointerInfo().getWithOffset(IncrementSize), HiMemVT, isVolatile, isNonTemporal, isInvariant, Alignment, AAInfo); // Build a factor node to remember that this load is independent of the // other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); // Legalized the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(LD, 1), Ch); } void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi) { EVT LoVT, HiVT; SDLoc dl(MLD); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0)); SDValue Ch = MLD->getChain(); SDValue Ptr = MLD->getBasePtr(); SDValue Mask = MLD->getMask(); unsigned Alignment = MLD->getOriginalAlignment(); + ISD::LoadExtType ExtType = MLD->getExtensionType(); // if Alignment is equal to the vector size, // take the half of it for the second part unsigned SecondHalfAlignment = (Alignment == MLD->getValueType(0).getSizeInBits()/8) ? Alignment/2 : Alignment; SDValue MaskLo, MaskHi; std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); EVT MemoryVT = MLD->getMemoryVT(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue Src0 = MLD->getSrc0(); SDValue Src0Lo, Src0Hi; std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MLD->getPointerInfo(), MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), Alignment, MLD->getAAInfo(), MLD->getRanges()); - Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, MMO); + Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, LoMemVT, MMO, + ExtType); unsigned IncrementSize = LoMemVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, Ptr.getValueType())); MMO = DAG.getMachineFunction(). getMachineMemOperand(MLD->getPointerInfo(), MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges()); - Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, MMO); + Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, HiMemVT, MMO, + ExtType); // Build a factor node to remember that this load is independent of the // other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); // Legalized the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(MLD, 1), Ch); } void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && "Operand types must be vectors"); EVT LoVT, HiVT; SDLoc DL(N); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); // Split the input. SDValue LL, LH, RL, RH; std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0); std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2)); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2)); } void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi) { // Get the dest types - they may not match the input types, e.g. int_to_fp. EVT LoVT, HiVT; SDLoc dl(N); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); // If the input also splits, handle it directly for a compile time speedup. // Otherwise split it by hand. EVT InVT = N->getOperand(0).getValueType(); if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) GetSplitVector(N->getOperand(0), Lo, Hi); else std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); if (N->getOpcode() == ISD::FP_ROUND) { Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1)); Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getOperand(1)); } else if (N->getOpcode() == ISD::CONVERT_RNDSAT) { SDValue DTyOpLo = DAG.getValueType(LoVT); SDValue DTyOpHi = DAG.getValueType(HiVT); SDValue STyOpLo = DAG.getValueType(Lo.getValueType()); SDValue STyOpHi = DAG.getValueType(Hi.getValueType()); SDValue RndOp = N->getOperand(3); SDValue SatOp = N->getOperand(4); ISD::CvtCode CvtCode = cast(N)->getCvtCode(); Lo = DAG.getConvertRndSat(LoVT, dl, Lo, DTyOpLo, STyOpLo, RndOp, SatOp, CvtCode); Hi = DAG.getConvertRndSat(HiVT, dl, Hi, DTyOpHi, STyOpHi, RndOp, SatOp, CvtCode); } else { Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi); } } void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); EVT SrcVT = N->getOperand(0).getValueType(); EVT DestVT = N->getValueType(0); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(DestVT); // We can do better than a generic split operation if the extend is doing // more than just doubling the width of the elements and the following are // true: // - The number of vector elements is even, // - the source type is legal, // - the type of a split source is illegal, // - the type of an extended (by doubling element size) source is legal, and // - the type of that extended source when split is legal. // // This won't necessarily completely legalize the operation, but it will // more effectively move in the right direction and prevent falling down // to scalarization in many cases due to the input vector being split too // far. unsigned NumElements = SrcVT.getVectorNumElements(); if ((NumElements & 1) == 0 && SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) { LLVMContext &Ctx = *DAG.getContext(); EVT NewSrcVT = EVT::getVectorVT( Ctx, EVT::getIntegerVT( Ctx, SrcVT.getVectorElementType().getSizeInBits() * 2), NumElements); EVT SplitSrcVT = EVT::getVectorVT(Ctx, SrcVT.getVectorElementType(), NumElements / 2); EVT SplitLoVT, SplitHiVT; std::tie(SplitLoVT, SplitHiVT) = DAG.GetSplitDestVTs(NewSrcVT); if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) && TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) { DEBUG(dbgs() << "Split vector extend via incremental extend:"; N->dump(&DAG); dbgs() << "\n"); // Extend the source vector by one step. SDValue NewSrc = DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0)); // Get the low and high halves of the new, extended one step, vector. std::tie(Lo, Hi) = DAG.SplitVector(NewSrc, dl); // Extend those vector halves the rest of the way. Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi); return; } } // Fall back to the generic unary operator splitting otherwise. SplitVecRes_UnaryOp(N, Lo, Hi); } void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo, SDValue &Hi) { // The low and high parts of the original input give four input vectors. SDValue Inputs[4]; SDLoc dl(N); GetSplitVector(N->getOperand(0), Inputs[0], Inputs[1]); GetSplitVector(N->getOperand(1), Inputs[2], Inputs[3]); EVT NewVT = Inputs[0].getValueType(); unsigned NewElts = NewVT.getVectorNumElements(); // If Lo or Hi uses elements from at most two of the four input vectors, then // express it as a vector shuffle of those two inputs. Otherwise extract the // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR. SmallVector Ops; for (unsigned High = 0; High < 2; ++High) { SDValue &Output = High ? Hi : Lo; // Build a shuffle mask for the output, discovering on the fly which // input vectors to use as shuffle operands (recorded in InputUsed). // If building a suitable shuffle vector proves too hard, then bail // out with useBuildVector set. unsigned InputUsed[2] = { -1U, -1U }; // Not yet discovered. unsigned FirstMaskIdx = High * NewElts; bool useBuildVector = false; for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) { // The mask element. This indexes into the input. int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset); // The input vector this mask element indexes into. unsigned Input = (unsigned)Idx / NewElts; if (Input >= array_lengthof(Inputs)) { // The mask element does not index into any input vector. Ops.push_back(-1); continue; } // Turn the index into an offset from the start of the input vector. Idx -= Input * NewElts; // Find or create a shuffle vector operand to hold this input. unsigned OpNo; for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { if (InputUsed[OpNo] == Input) { // This input vector is already an operand. break; } else if (InputUsed[OpNo] == -1U) { // Create a new operand for this input vector. InputUsed[OpNo] = Input; break; } } if (OpNo >= array_lengthof(InputUsed)) { // More than two input vectors used! Give up on trying to create a // shuffle vector. Insert all elements into a BUILD_VECTOR instead. useBuildVector = true; break; } // Add the mask index for the new shuffle vector. Ops.push_back(Idx + OpNo * NewElts); } if (useBuildVector) { EVT EltVT = NewVT.getVectorElementType(); SmallVector SVOps; // Extract the input elements by hand. for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) { // The mask element. This indexes into the input. int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset); // The input vector this mask element indexes into. unsigned Input = (unsigned)Idx / NewElts; if (Input >= array_lengthof(Inputs)) { // The mask element is "undef" or indexes off the end of the input. SVOps.push_back(DAG.getUNDEF(EltVT)); continue; } // Turn the index into an offset from the start of the input vector. Idx -= Input * NewElts; // Extract the vector element by hand. SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Inputs[Input], DAG.getConstant(Idx, TLI.getVectorIdxTy()))); } // Construct the Lo/Hi output using a BUILD_VECTOR. Output = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, SVOps); } else if (InputUsed[0] == -1U) { // No input vectors were used! The result is undefined. Output = DAG.getUNDEF(NewVT); } else { SDValue Op0 = Inputs[InputUsed[0]]; // If only one input was used, use an undefined vector for the other. SDValue Op1 = InputUsed[1] == -1U ? DAG.getUNDEF(NewVT) : Inputs[InputUsed[1]]; // At least one input vector was used. Create a new shuffle vector. Output = DAG.getVectorShuffle(NewVT, dl, Op0, Op1, &Ops[0]); } Ops.clear(); } } //===----------------------------------------------------------------------===// // Operand Vector Splitting //===----------------------------------------------------------------------===// /// SplitVectorOperand - This method is called when the specified operand of the /// specified node is found to need vector splitting. At this point, all of the /// result types of the node are known to be legal, but other operands of the /// node may need legalization as well as the specified one. bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { DEBUG(dbgs() << "Split node operand: "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); // See if the target wants to custom split this node. if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) return false; if (!Res.getNode()) { switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "SplitVectorOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif report_fatal_error("Do not know how to split this operator's " "operand!\n"); case ISD::SETCC: Res = SplitVecOp_VSETCC(N); break; case ISD::BITCAST: Res = SplitVecOp_BITCAST(N); break; case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break; case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break; case ISD::CONCAT_VECTORS: Res = SplitVecOp_CONCAT_VECTORS(N); break; case ISD::TRUNCATE: Res = SplitVecOp_TRUNCATE(N); break; case ISD::FP_ROUND: Res = SplitVecOp_FP_ROUND(N); break; case ISD::STORE: Res = SplitVecOp_STORE(cast(N), OpNo); break; case ISD::MSTORE: Res = SplitVecOp_MSTORE(cast(N), OpNo); break; case ISD::VSELECT: Res = SplitVecOp_VSELECT(N, OpNo); break; case ISD::CTTZ: case ISD::CTLZ: case ISD::CTPOP: case ISD::FP_EXTEND: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::FTRUNC: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: Res = SplitVecOp_UnaryOp(N); break; } } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer // core about this. if (Res.getNode() == N) return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && "Invalid operand expansion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) { // The only possibility for an illegal operand is the mask, since result type // legalization would have handled this node already otherwise. assert(OpNo == 0 && "Illegal operand must be mask"); SDValue Mask = N->getOperand(0); SDValue Src0 = N->getOperand(1); SDValue Src1 = N->getOperand(2); EVT Src0VT = Src0.getValueType(); SDLoc DL(N); assert(Mask.getValueType().isVector() && "VSELECT without a vector mask?"); SDValue Lo, Hi; GetSplitVector(N->getOperand(0), Lo, Hi); assert(Lo.getValueType() == Hi.getValueType() && "Lo and Hi have differing types"); EVT LoOpVT, HiOpVT; std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(Src0VT); assert(LoOpVT == HiOpVT && "Asymmetric vector split?"); SDValue LoOp0, HiOp0, LoOp1, HiOp1, LoMask, HiMask; std::tie(LoOp0, HiOp0) = DAG.SplitVector(Src0, DL); std::tie(LoOp1, HiOp1) = DAG.SplitVector(Src1, DL); std::tie(LoMask, HiMask) = DAG.SplitVector(Mask, DL); SDValue LoSelect = DAG.getNode(ISD::VSELECT, DL, LoOpVT, LoMask, LoOp0, LoOp1); SDValue HiSelect = DAG.getNode(ISD::VSELECT, DL, HiOpVT, HiMask, HiOp0, HiOp1); return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect); } SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) { // The result has a legal vector type, but the input needs splitting. EVT ResVT = N->getValueType(0); SDValue Lo, Hi; SDLoc dl(N); GetSplitVector(N->getOperand(0), Lo, Hi); EVT InVT = Lo.getValueType(); EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), InVT.getVectorNumElements()); Lo = DAG.getNode(N->getOpcode(), dl, OutVT, Lo); Hi = DAG.getNode(N->getOpcode(), dl, OutVT, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } SDValue DAGTypeLegalizer::SplitVecOp_BITCAST(SDNode *N) { // For example, i64 = BITCAST v4i16 on alpha. Typically the vector will // end up being split all the way down to individual components. Convert the // split pieces into integers and reassemble. SDValue Lo, Hi; GetSplitVector(N->getOperand(0), Lo, Hi); Lo = BitConvertToInteger(Lo); Hi = BitConvertToInteger(Hi); if (TLI.isBigEndian()) std::swap(Lo, Hi); return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), JoinIntegers(Lo, Hi)); } SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) { // We know that the extracted result type is legal. EVT SubVT = N->getValueType(0); SDValue Idx = N->getOperand(1); SDLoc dl(N); SDValue Lo, Hi; GetSplitVector(N->getOperand(0), Lo, Hi); uint64_t LoElts = Lo.getValueType().getVectorNumElements(); uint64_t IdxVal = cast(Idx)->getZExtValue(); if (IdxVal < LoElts) { assert(IdxVal + SubVT.getVectorNumElements() <= LoElts && "Extracted subvector crosses vector split!"); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx); } else { return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Hi, DAG.getConstant(IdxVal - LoElts, Idx.getValueType())); } } SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { SDValue Vec = N->getOperand(0); SDValue Idx = N->getOperand(1); EVT VecVT = Vec.getValueType(); if (isa(Idx)) { uint64_t IdxVal = cast(Idx)->getZExtValue(); assert(IdxVal < VecVT.getVectorNumElements() && "Invalid vector index!"); SDValue Lo, Hi; GetSplitVector(Vec, Lo, Hi); uint64_t LoElts = Lo.getValueType().getVectorNumElements(); if (IdxVal < LoElts) return SDValue(DAG.UpdateNodeOperands(N, Lo, Idx), 0); return SDValue(DAG.UpdateNodeOperands(N, Hi, DAG.getConstant(IdxVal - LoElts, Idx.getValueType())), 0); } // See if the target wants to custom expand this node. if (CustomLowerNode(N, N->getValueType(0), true)) return SDValue(); // Store the vector to the stack. EVT EltVT = VecVT.getVectorElementType(); SDLoc dl(N); SDValue StackPtr = DAG.CreateStackTemporary(VecVT); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo(), false, false, 0); // Load back the required element. StackPtr = GetVectorElementPointer(StackPtr, EltVT, Idx); return DAG.getExtLoad(ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr, MachinePointerInfo(), EltVT, false, false, false, 0); } SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo) { SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); SDValue Mask = N->getMask(); - SDValue Data = N->getData(); + SDValue Data = N->getValue(); EVT MemoryVT = N->getMemoryVT(); unsigned Alignment = N->getOriginalAlignment(); SDLoc DL(N); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; GetSplitVector(Data, DataLo, DataHi); SDValue MaskLo, MaskHi; GetSplitVector(Mask, MaskLo, MaskHi); // if Alignment is equal to the vector size, // take the half of it for the second part unsigned SecondHalfAlignment = (Alignment == Data->getValueType(0).getSizeInBits()/8) ? Alignment/2 : Alignment; SDValue Lo, Hi; MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(N->getPointerInfo(), MachineMemOperand::MOStore, LoMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); - Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, MMO); + Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO, + N->isTruncatingStore()); unsigned IncrementSize = LoMemVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, Ptr.getValueType())); MMO = DAG.getMachineFunction(). getMachineMemOperand(N->getPointerInfo(), MachineMemOperand::MOStore, HiMemVT.getStoreSize(), SecondHalfAlignment, N->getAAInfo(), N->getRanges()); - Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, MMO); + Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO, + N->isTruncatingStore()); // Build a factor node to remember that this store is independent of the // other one. return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); } SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) { assert(N->isUnindexed() && "Indexed store of vector?"); assert(OpNo == 1 && "Can only split the stored value"); SDLoc DL(N); bool isTruncating = N->isTruncatingStore(); SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); EVT MemoryVT = N->getMemoryVT(); unsigned Alignment = N->getOriginalAlignment(); bool isVol = N->isVolatile(); bool isNT = N->isNonTemporal(); AAMDNodes AAInfo = N->getAAInfo(); SDValue Lo, Hi; GetSplitVector(N->getOperand(1), Lo, Hi); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); unsigned IncrementSize = LoMemVT.getSizeInBits()/8; if (isTruncating) Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), LoMemVT, isVol, isNT, Alignment, AAInfo); else Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), isVol, isNT, Alignment, AAInfo); // Increment the pointer to the other half. Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, Ptr.getValueType())); if (isTruncating) Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), HiMemVT, isVol, isNT, Alignment, AAInfo); else Hi = DAG.getStore(Ch, DL, Hi, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), isVol, isNT, Alignment, AAInfo); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); } SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) { SDLoc DL(N); // The input operands all must have the same type, and we know the result // type is valid. Convert this to a buildvector which extracts all the // input elements. // TODO: If the input elements are power-two vectors, we could convert this to // a new CONCAT_VECTORS node with elements that are half-wide. SmallVector Elts; EVT EltVT = N->getValueType(0).getVectorElementType(); for (unsigned op = 0, e = N->getNumOperands(); op != e; ++op) { SDValue Op = N->getOperand(op); for (unsigned i = 0, e = Op.getValueType().getVectorNumElements(); i != e; ++i) { Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op, DAG.getConstant(i, TLI.getVectorIdxTy()))); } } return DAG.getNode(ISD::BUILD_VECTOR, DL, N->getValueType(0), Elts); } SDValue DAGTypeLegalizer::SplitVecOp_TRUNCATE(SDNode *N) { // The result type is legal, but the input type is illegal. If splitting // ends up with the result type of each half still being legal, just // do that. If, however, that would result in an illegal result type, // we can try to get more clever with power-two vectors. Specifically, // split the input type, but also widen the result element size, then // concatenate the halves and truncate again. For example, consider a target // where v8i8 is legal and v8i32 is not (ARM, which doesn't have 256-bit // vectors). To perform a "%res = v8i8 trunc v8i32 %in" we do: // %inlo = v4i32 extract_subvector %in, 0 // %inhi = v4i32 extract_subvector %in, 4 // %lo16 = v4i16 trunc v4i32 %inlo // %hi16 = v4i16 trunc v4i32 %inhi // %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16 // %res = v8i8 trunc v8i16 %in16 // // Without this transform, the original truncate would end up being // scalarized, which is pretty much always a last resort. SDValue InVec = N->getOperand(0); EVT InVT = InVec->getValueType(0); EVT OutVT = N->getValueType(0); unsigned NumElements = OutVT.getVectorNumElements(); // Widening should have already made sure this is a power-two vector // if we're trying to split it at all. assert() that's true, just in case. assert(!(NumElements & 1) && "Splitting vector, but not in half!"); unsigned InElementSize = InVT.getVectorElementType().getSizeInBits(); unsigned OutElementSize = OutVT.getVectorElementType().getSizeInBits(); // If the input elements are only 1/2 the width of the result elements, // just use the normal splitting. Our trick only work if there's room // to split more than once. if (InElementSize <= OutElementSize * 2) return SplitVecOp_UnaryOp(N); SDLoc DL(N); // Extract the halves of the input via extract_subvector. SDValue InLoVec, InHiVec; std::tie(InLoVec, InHiVec) = DAG.SplitVector(InVec, DL); // Truncate them to 1/2 the element size. EVT HalfElementVT = EVT::getIntegerVT(*DAG.getContext(), InElementSize/2); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements/2); SDValue HalfLo = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, InLoVec); SDValue HalfHi = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, InHiVec); // Concatenate them to get the full intermediate truncation result. EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements); SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo, HalfHi); // Now finish up by truncating all the way down to the original result // type. This should normally be something that ends up being legal directly, // but in theory if a target has very wide vectors and an annoyingly // restricted set of legal types, this split can chain to build things up. return DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec); } SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && "Operand types must be vectors"); // The result has a legal vector type, but the input needs splitting. SDValue Lo0, Hi0, Lo1, Hi1, LoRes, HiRes; SDLoc DL(N); GetSplitVector(N->getOperand(0), Lo0, Hi0); GetSplitVector(N->getOperand(1), Lo1, Hi1); unsigned PartElements = Lo0.getValueType().getVectorNumElements(); EVT PartResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, PartElements); EVT WideResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 2*PartElements); LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2)); HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2)); SDValue Con = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideResVT, LoRes, HiRes); return PromoteTargetBoolean(Con, N->getValueType(0)); } SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) { // The result has a legal vector type, but the input needs splitting. EVT ResVT = N->getValueType(0); SDValue Lo, Hi; SDLoc DL(N); GetSplitVector(N->getOperand(0), Lo, Hi); EVT InVT = Lo.getValueType(); EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), InVT.getVectorNumElements()); Lo = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Lo, N->getOperand(1)); Hi = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Hi, N->getOperand(1)); return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); } //===----------------------------------------------------------------------===// // Result Vector Widening //===----------------------------------------------------------------------===// void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { DEBUG(dbgs() << "Widen node result " << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"); // See if the target wants to custom widen this node. if (CustomWidenLowerNode(N, N->getValueType(ResNo))) return; SDValue Res = SDValue(); switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "WidenVectorResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to widen the result of this operator!"); case ISD::MERGE_VALUES: Res = WidenVecRes_MERGE_VALUES(N, ResNo); break; case ISD::BITCAST: Res = WidenVecRes_BITCAST(N); break; case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = WidenVecRes_CONCAT_VECTORS(N); break; case ISD::CONVERT_RNDSAT: Res = WidenVecRes_CONVERT_RNDSAT(N); break; case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::FP_ROUND_INREG: Res = WidenVecRes_InregOp(N); break; case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break; case ISD::LOAD: Res = WidenVecRes_LOAD(N); break; case ISD::SCALAR_TO_VECTOR: Res = WidenVecRes_SCALAR_TO_VECTOR(N); break; case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break; case ISD::VSELECT: case ISD::SELECT: Res = WidenVecRes_SELECT(N); break; case ISD::SELECT_CC: Res = WidenVecRes_SELECT_CC(N); break; case ISD::SETCC: Res = WidenVecRes_SETCC(N); break; case ISD::UNDEF: Res = WidenVecRes_UNDEF(N); break; case ISD::VECTOR_SHUFFLE: Res = WidenVecRes_VECTOR_SHUFFLE(cast(N)); break; case ISD::MLOAD: Res = WidenVecRes_MLOAD(cast(N)); break; case ISD::ADD: case ISD::AND: case ISD::MUL: case ISD::MULHS: case ISD::MULHU: case ISD::OR: case ISD::SUB: case ISD::XOR: case ISD::FMINNUM: case ISD::FMAXNUM: Res = WidenVecRes_Binary(N); break; case ISD::FADD: case ISD::FCOPYSIGN: case ISD::FMUL: case ISD::FPOW: case ISD::FSUB: case ISD::FDIV: case ISD::FREM: case ISD::SDIV: case ISD::UDIV: case ISD::SREM: case ISD::UREM: Res = WidenVecRes_BinaryCanTrap(N); break; case ISD::FPOWI: Res = WidenVecRes_POWI(N); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: Res = WidenVecRes_Shift(N); break; case ISD::ANY_EXTEND: case ISD::FP_EXTEND: case ISD::FP_ROUND: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::SIGN_EXTEND: case ISD::SINT_TO_FP: case ISD::TRUNCATE: case ISD::UINT_TO_FP: case ISD::ZERO_EXTEND: Res = WidenVecRes_Convert(N); break; case ISD::BSWAP: case ISD::CTLZ: case ISD::CTPOP: case ISD::CTTZ: case ISD::FABS: case ISD::FCEIL: case ISD::FCOS: case ISD::FEXP: case ISD::FEXP2: case ISD::FFLOOR: case ISD::FLOG: case ISD::FLOG10: case ISD::FLOG2: case ISD::FNEARBYINT: case ISD::FNEG: case ISD::FRINT: case ISD::FROUND: case ISD::FSIN: case ISD::FSQRT: case ISD::FTRUNC: Res = WidenVecRes_Unary(N); break; case ISD::FMA: Res = WidenVecRes_Ternary(N); break; } // If Res is null, the sub-method took care of registering the result. if (Res.getNode()) SetWidenedVector(SDValue(N, ResNo), Res); } SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) { // Ternary op widening. SDLoc dl(N); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); SDValue InOp3 = GetWidenedVector(N->getOperand(2)); return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3); } SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { // Binary op widening. SDLoc dl(N); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2); } SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { // Binary op widening for operations that can trap. unsigned Opcode = N->getOpcode(); SDLoc dl(N); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); EVT WidenEltVT = WidenVT.getVectorElementType(); EVT VT = WidenVT; unsigned NumElts = VT.getVectorNumElements(); while (!TLI.isTypeLegal(VT) && NumElts != 1) { NumElts = NumElts / 2; VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); } if (NumElts != 1 && !TLI.canOpTrap(N->getOpcode(), VT)) { // Operation doesn't trap so just widen as normal. SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2); } // No legal vector version so unroll the vector operation and then widen. if (NumElts == 1) return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements()); // Since the operation can trap, apply operation on the original vector. EVT MaxVT = VT; SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); unsigned CurNumElts = N->getValueType(0).getVectorNumElements(); SmallVector ConcatOps(CurNumElts); unsigned ConcatEnd = 0; // Current ConcatOps index. int Idx = 0; // Current Idx into input vectors. // NumElts := greatest legal vector size (at most WidenVT) // while (orig. vector has unhandled elements) { // take munches of size NumElts from the beginning and add to ConcatOps // NumElts := next smaller supported vector size or 1 // } while (CurNumElts != 0) { while (CurNumElts >= NumElts) { SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1, DAG.getConstant(Idx, TLI.getVectorIdxTy())); SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2, DAG.getConstant(Idx, TLI.getVectorIdxTy())); ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2); Idx += NumElts; CurNumElts -= NumElts; } do { NumElts = NumElts / 2; VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); } while (!TLI.isTypeLegal(VT) && NumElts != 1); if (NumElts == 1) { for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) { SDValue EOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp1, DAG.getConstant(Idx, TLI.getVectorIdxTy())); SDValue EOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp2, DAG.getConstant(Idx, TLI.getVectorIdxTy())); ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT, EOp1, EOp2); } CurNumElts = 0; } } // Check to see if we have a single operation with the widen type. if (ConcatEnd == 1) { VT = ConcatOps[0].getValueType(); if (VT == WidenVT) return ConcatOps[0]; } // while (Some element of ConcatOps is not of type MaxVT) { // From the end of ConcatOps, collect elements of the same type and put // them into an op of the next larger supported type // } while (ConcatOps[ConcatEnd-1].getValueType() != MaxVT) { Idx = ConcatEnd - 1; VT = ConcatOps[Idx--].getValueType(); while (Idx >= 0 && ConcatOps[Idx].getValueType() == VT) Idx--; int NextSize = VT.isVector() ? VT.getVectorNumElements() : 1; EVT NextVT; do { NextSize *= 2; NextVT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NextSize); } while (!TLI.isTypeLegal(NextVT)); if (!VT.isVector()) { // Scalar type, create an INSERT_VECTOR_ELEMENT of type NextVT SDValue VecOp = DAG.getUNDEF(NextVT); unsigned NumToInsert = ConcatEnd - Idx - 1; for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) { VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp, ConcatOps[OpIdx], DAG.getConstant(i, TLI.getVectorIdxTy())); } ConcatOps[Idx+1] = VecOp; ConcatEnd = Idx + 2; } else { // Vector type, create a CONCAT_VECTORS of type NextVT SDValue undefVec = DAG.getUNDEF(VT); unsigned OpsToConcat = NextSize/VT.getVectorNumElements(); SmallVector SubConcatOps(OpsToConcat); unsigned RealVals = ConcatEnd - Idx - 1; unsigned SubConcatEnd = 0; unsigned SubConcatIdx = Idx + 1; while (SubConcatEnd < RealVals) SubConcatOps[SubConcatEnd++] = ConcatOps[++Idx]; while (SubConcatEnd < OpsToConcat) SubConcatOps[SubConcatEnd++] = undefVec; ConcatOps[SubConcatIdx] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NextVT, SubConcatOps); ConcatEnd = SubConcatIdx + 1; } } // Check to see if we have a single operation with the widen type. if (ConcatEnd == 1) { VT = ConcatOps[0].getValueType(); if (VT == WidenVT) return ConcatOps[0]; } // add undefs of size MaxVT until ConcatOps grows to length of WidenVT unsigned NumOps = WidenVT.getVectorNumElements()/MaxVT.getVectorNumElements(); if (NumOps != ConcatEnd ) { SDValue UndefVal = DAG.getUNDEF(MaxVT); for (unsigned j = ConcatEnd; j < NumOps; ++j) ConcatOps[j] = UndefVal; } return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, makeArrayRef(ConcatOps.data(), NumOps)); } SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { SDValue InOp = N->getOperand(0); SDLoc DL(N); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned WidenNumElts = WidenVT.getVectorNumElements(); EVT InVT = InOp.getValueType(); EVT InEltVT = InVT.getVectorElementType(); EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts); unsigned Opcode = N->getOpcode(); unsigned InVTNumElts = InVT.getVectorNumElements(); if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) { InOp = GetWidenedVector(N->getOperand(0)); InVT = InOp.getValueType(); InVTNumElts = InVT.getVectorNumElements(); if (InVTNumElts == WidenNumElts) { if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InOp); return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1)); } } if (TLI.isTypeLegal(InWidenVT)) { // Because the result and the input are different vector types, widening // the result could create a legal type but widening the input might make // it an illegal type that might lead to repeatedly splitting the input // and then widening it. To avoid this, we widen the input only if // it results in a legal type. if (WidenNumElts % InVTNumElts == 0) { // Widen the input and call convert on the widened input vector. unsigned NumConcat = WidenNumElts/InVTNumElts; SmallVector Ops(NumConcat); Ops[0] = InOp; SDValue UndefVal = DAG.getUNDEF(InVT); for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = UndefVal; SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops); if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InVec); return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1)); } if (InVTNumElts % WidenNumElts == 0) { SDValue InVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InWidenVT, InOp, DAG.getConstant(0, TLI.getVectorIdxTy())); // Extract the input and convert the shorten input vector. if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InVal); return DAG.getNode(Opcode, DL, WidenVT, InVal, N->getOperand(1)); } } // Otherwise unroll into some nasty scalar code and rebuild the vector. SmallVector Ops(WidenNumElts); EVT EltVT = WidenVT.getVectorElementType(); unsigned MinElts = std::min(InVTNumElts, WidenNumElts); unsigned i; for (i=0; i < MinElts; ++i) { SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp, DAG.getConstant(i, TLI.getVectorIdxTy())); if (N->getNumOperands() == 1) Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val); else Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1)); } SDValue UndefVal = DAG.getUNDEF(EltVT); for (; i < WidenNumElts; ++i) Ops[i] = UndefVal; return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); SDValue ShOp = N->getOperand(1); return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp); } SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); SDValue ShOp = N->getOperand(1); EVT ShVT = ShOp.getValueType(); if (getTypeAction(ShVT) == TargetLowering::TypeWidenVector) { ShOp = GetWidenedVector(ShOp); ShVT = ShOp.getValueType(); } EVT ShWidenVT = EVT::getVectorVT(*DAG.getContext(), ShVT.getVectorElementType(), WidenVT.getVectorNumElements()); if (ShVT != ShWidenVT) ShOp = ModifyToType(ShOp, ShWidenVT); return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp); } SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) { // Unary op widening. EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp); } SDValue DAGTypeLegalizer::WidenVecRes_InregOp(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), cast(N->getOperand(1))->getVT() .getVectorElementType(), WidenVT.getVectorNumElements()); SDValue WidenLHS = GetWidenedVector(N->getOperand(0)); return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, WidenLHS, DAG.getValueType(ExtVT)); } SDValue DAGTypeLegalizer::WidenVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) { SDValue WidenVec = DisintegrateMERGE_VALUES(N, ResNo); return GetWidenedVector(WidenVec); } SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) { SDValue InOp = N->getOperand(0); EVT InVT = InOp.getValueType(); EVT VT = N->getValueType(0); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDLoc dl(N); switch (getTypeAction(InVT)) { case TargetLowering::TypeLegal: break; case TargetLowering::TypePromoteInteger: // If the incoming type is a vector that is being promoted, then // we know that the elements are arranged differently and that we // must perform the conversion using a stack slot. if (InVT.isVector()) break; // If the InOp is promoted to the same size, convert it. Otherwise, // fall out of the switch and widen the promoted input. InOp = GetPromotedInteger(InOp); InVT = InOp.getValueType(); if (WidenVT.bitsEq(InVT)) return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp); break; case TargetLowering::TypeSoftenFloat: case TargetLowering::TypeExpandInteger: case TargetLowering::TypeExpandFloat: case TargetLowering::TypeScalarizeVector: case TargetLowering::TypeSplitVector: break; case TargetLowering::TypeWidenVector: // If the InOp is widened to the same size, convert it. Otherwise, fall // out of the switch and widen the widened input. InOp = GetWidenedVector(InOp); InVT = InOp.getValueType(); if (WidenVT.bitsEq(InVT)) // The input widens to the same size. Convert to the widen value. return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp); break; } unsigned WidenSize = WidenVT.getSizeInBits(); unsigned InSize = InVT.getSizeInBits(); // x86mmx is not an acceptable vector element type, so don't try. if (WidenSize % InSize == 0 && InVT != MVT::x86mmx) { // Determine new input vector type. The new input vector type will use // the same element type (if its a vector) or use the input type as a // vector. It is the same size as the type to widen to. EVT NewInVT; unsigned NewNumElts = WidenSize / InSize; if (InVT.isVector()) { EVT InEltVT = InVT.getVectorElementType(); NewInVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenSize / InEltVT.getSizeInBits()); } else { NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumElts); } if (TLI.isTypeLegal(NewInVT)) { // Because the result and the input are different vector types, widening // the result could create a legal type but widening the input might make // it an illegal type that might lead to repeatedly splitting the input // and then widening it. To avoid this, we widen the input only if // it results in a legal type. SmallVector Ops(NewNumElts); SDValue UndefVal = DAG.getUNDEF(InVT); Ops[0] = InOp; for (unsigned i = 1; i < NewNumElts; ++i) Ops[i] = UndefVal; SDValue NewVec; if (InVT.isVector()) NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops); else NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, NewInVT, Ops); return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec); } } return CreateStackStoreLoad(InOp, WidenVT); } SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) { SDLoc dl(N); // Build a vector with undefined for the new nodes. EVT VT = N->getValueType(0); // Integer BUILD_VECTOR operands may be larger than the node's vector element // type. The UNDEFs need to have the same type as the existing operands. EVT EltVT = N->getOperand(0).getValueType(); unsigned NumElts = VT.getVectorNumElements(); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); unsigned WidenNumElts = WidenVT.getVectorNumElements(); SmallVector NewOps(N->op_begin(), N->op_end()); assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!"); NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT)); return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, NewOps); } SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { EVT InVT = N->getOperand(0).getValueType(); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); unsigned WidenNumElts = WidenVT.getVectorNumElements(); unsigned NumInElts = InVT.getVectorNumElements(); unsigned NumOperands = N->getNumOperands(); bool InputWidened = false; // Indicates we need to widen the input. if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) { if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) { // Add undef vectors to widen to correct length. unsigned NumConcat = WidenVT.getVectorNumElements() / InVT.getVectorNumElements(); SDValue UndefVal = DAG.getUNDEF(InVT); SmallVector Ops(NumConcat); for (unsigned i=0; i < NumOperands; ++i) Ops[i] = N->getOperand(i); for (unsigned i = NumOperands; i != NumConcat; ++i) Ops[i] = UndefVal; return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Ops); } } else { InputWidened = true; if (WidenVT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) { // The inputs and the result are widen to the same value. unsigned i; for (i=1; i < NumOperands; ++i) if (N->getOperand(i).getOpcode() != ISD::UNDEF) break; if (i == NumOperands) // Everything but the first operand is an UNDEF so just return the // widened first operand. return GetWidenedVector(N->getOperand(0)); if (NumOperands == 2) { // Replace concat of two operands with a shuffle. SmallVector MaskOps(WidenNumElts, -1); for (unsigned i = 0; i < NumInElts; ++i) { MaskOps[i] = i; MaskOps[i + NumInElts] = i + WidenNumElts; } return DAG.getVectorShuffle(WidenVT, dl, GetWidenedVector(N->getOperand(0)), GetWidenedVector(N->getOperand(1)), &MaskOps[0]); } } } // Fall back to use extracts and build vector. EVT EltVT = WidenVT.getVectorElementType(); SmallVector Ops(WidenNumElts); unsigned Idx = 0; for (unsigned i=0; i < NumOperands; ++i) { SDValue InOp = N->getOperand(i); if (InputWidened) InOp = GetWidenedVector(InOp); for (unsigned j=0; j < NumInElts; ++j) Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(j, TLI.getVectorIdxTy())); } SDValue UndefVal = DAG.getUNDEF(EltVT); for (; Idx < WidenNumElts; ++Idx) Ops[Idx] = UndefVal; return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { SDLoc dl(N); SDValue InOp = N->getOperand(0); SDValue RndOp = N->getOperand(3); SDValue SatOp = N->getOperand(4); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned WidenNumElts = WidenVT.getVectorNumElements(); EVT InVT = InOp.getValueType(); EVT InEltVT = InVT.getVectorElementType(); EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts); SDValue DTyOp = DAG.getValueType(WidenVT); SDValue STyOp = DAG.getValueType(InWidenVT); ISD::CvtCode CvtCode = cast(N)->getCvtCode(); unsigned InVTNumElts = InVT.getVectorNumElements(); if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) { InOp = GetWidenedVector(InOp); InVT = InOp.getValueType(); InVTNumElts = InVT.getVectorNumElements(); if (InVTNumElts == WidenNumElts) return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp, SatOp, CvtCode); } if (TLI.isTypeLegal(InWidenVT)) { // Because the result and the input are different vector types, widening // the result could create a legal type but widening the input might make // it an illegal type that might lead to repeatedly splitting the input // and then widening it. To avoid this, we widen the input only if // it results in a legal type. if (WidenNumElts % InVTNumElts == 0) { // Widen the input and call convert on the widened input vector. unsigned NumConcat = WidenNumElts/InVTNumElts; SmallVector Ops(NumConcat); Ops[0] = InOp; SDValue UndefVal = DAG.getUNDEF(InVT); for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = UndefVal; InOp = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWidenVT, Ops); return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp, SatOp, CvtCode); } if (InVTNumElts % WidenNumElts == 0) { // Extract the input and convert the shorten input vector. InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InWidenVT, InOp, DAG.getConstant(0, TLI.getVectorIdxTy())); return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp, SatOp, CvtCode); } } // Otherwise unroll into some nasty scalar code and rebuild the vector. SmallVector Ops(WidenNumElts); EVT EltVT = WidenVT.getVectorElementType(); DTyOp = DAG.getValueType(EltVT); STyOp = DAG.getValueType(InEltVT); unsigned MinElts = std::min(InVTNumElts, WidenNumElts); unsigned i; for (i=0; i < MinElts; ++i) { SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp, DAG.getConstant(i, TLI.getVectorIdxTy())); Ops[i] = DAG.getConvertRndSat(WidenVT, dl, ExtVal, DTyOp, STyOp, RndOp, SatOp, CvtCode); } SDValue UndefVal = DAG.getUNDEF(EltVT); for (; i < WidenNumElts; ++i) Ops[i] = UndefVal; return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { EVT VT = N->getValueType(0); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); unsigned WidenNumElts = WidenVT.getVectorNumElements(); SDValue InOp = N->getOperand(0); SDValue Idx = N->getOperand(1); SDLoc dl(N); if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector) InOp = GetWidenedVector(InOp); EVT InVT = InOp.getValueType(); // Check if we can just return the input vector after widening. uint64_t IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0 && InVT == WidenVT) return InOp; // Check if we can extract from the vector. unsigned InNumElts = InVT.getVectorNumElements(); if (IdxVal % WidenNumElts == 0 && IdxVal + WidenNumElts < InNumElts) return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, WidenVT, InOp, Idx); // We could try widening the input to the right length but for now, extract // the original elements, fill the rest with undefs and build a vector. SmallVector Ops(WidenNumElts); EVT EltVT = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); unsigned i; for (i=0; i < NumElts; ++i) Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(IdxVal+i, TLI.getVectorIdxTy())); SDValue UndefVal = DAG.getUNDEF(EltVT); for (; i < WidenNumElts; ++i) Ops[i] = UndefVal; return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) { SDValue InOp = GetWidenedVector(N->getOperand(0)); return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), InOp.getValueType(), InOp, N->getOperand(1), N->getOperand(2)); } SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) { LoadSDNode *LD = cast(N); ISD::LoadExtType ExtType = LD->getExtensionType(); SDValue Result; SmallVector LdChain; // Chain for the series of load if (ExtType != ISD::NON_EXTLOAD) Result = GenWidenVectorExtLoads(LdChain, LD, ExtType); else Result = GenWidenVectorLoads(LdChain, LD); // If we generate a single load, we can use that for the chain. Otherwise, // build a factor node to remember the multiple loads are independent and // chain to that. SDValue NewChain; if (LdChain.size() == 1) NewChain = LdChain[0]; else NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, LdChain); // Modified the chain - switch anything that used the old chain to use // the new one. ReplaceValueWith(SDValue(N, 1), NewChain); return Result; } SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0)); SDValue Mask = N->getMask(); EVT MaskVT = Mask.getValueType(); SDValue Src0 = GetWidenedVector(N->getSrc0()); + ISD::LoadExtType ExtType = N->getExtensionType(); SDLoc dl(N); if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector) Mask = GetWidenedVector(Mask); else { EVT BoolVT = getSetCCResultType(WidenVT); // We can't use ModifyToType() because we should fill the mask with // zeroes unsigned WidenNumElts = BoolVT.getVectorNumElements(); unsigned MaskNumElts = MaskVT.getVectorNumElements(); unsigned NumConcat = WidenNumElts / MaskNumElts; SmallVector Ops(NumConcat); SDValue ZeroVal = DAG.getConstant(0, MaskVT); Ops[0] = Mask; for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = ZeroVal; Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops); } - // Rebuild memory operand because MemoryVT was changed - MachineMemOperand *MMO = DAG.getMachineFunction(). - getMachineMemOperand(N->getPointerInfo(), - MachineMemOperand::MOLoad, WidenVT.getStoreSize(), - N->getAlignment(), N->getAAInfo(), N->getRanges()); - SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(), - Mask, Src0, MMO); + Mask, Src0, N->getMemoryVT(), + N->getMemOperand(), ExtType); // Legalized the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), WidenVT, N->getOperand(0)); } SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned WidenNumElts = WidenVT.getVectorNumElements(); SDValue Cond1 = N->getOperand(0); EVT CondVT = Cond1.getValueType(); if (CondVT.isVector()) { EVT CondEltVT = CondVT.getVectorElementType(); EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(), CondEltVT, WidenNumElts); if (getTypeAction(CondVT) == TargetLowering::TypeWidenVector) Cond1 = GetWidenedVector(Cond1); // If we have to split the condition there is no point in widening the // select. This would result in an cycle of widening the select -> // widening the condition operand -> splitting the condition operand -> // splitting the select -> widening the select. Instead split this select // further and widen the resulting type. if (getTypeAction(CondVT) == TargetLowering::TypeSplitVector) { SDValue SplitSelect = SplitVecOp_VSELECT(N, 0); SDValue Res = ModifyToType(SplitSelect, WidenVT); return Res; } if (Cond1.getValueType() != CondWidenVT) Cond1 = ModifyToType(Cond1, CondWidenVT); } SDValue InOp1 = GetWidenedVector(N->getOperand(1)); SDValue InOp2 = GetWidenedVector(N->getOperand(2)); assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT); return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, Cond1, InOp1, InOp2); } SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) { SDValue InOp1 = GetWidenedVector(N->getOperand(2)); SDValue InOp2 = GetWidenedVector(N->getOperand(3)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), InOp1.getValueType(), N->getOperand(0), N->getOperand(1), InOp1, InOp2, N->getOperand(4)); } SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) { assert(N->getValueType(0).isVector() == N->getOperand(0).getValueType().isVector() && "Scalar/Vector type mismatch"); if (N->getValueType(0).isVector()) return WidenVecRes_VSETCC(N); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); return DAG.getNode(ISD::SETCC, SDLoc(N), WidenVT, InOp1, InOp2, N->getOperand(2)); } SDValue DAGTypeLegalizer::WidenVecRes_UNDEF(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); return DAG.getUNDEF(WidenVT); } SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N) { EVT VT = N->getValueType(0); SDLoc dl(N); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); unsigned NumElts = VT.getVectorNumElements(); unsigned WidenNumElts = WidenVT.getVectorNumElements(); SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); // Adjust mask based on new input vector length. SmallVector NewMask; for (unsigned i = 0; i != NumElts; ++i) { int Idx = N->getMaskElt(i); if (Idx < (int)NumElts) NewMask.push_back(Idx); else NewMask.push_back(Idx - NumElts + WidenNumElts); } for (unsigned i = NumElts; i != WidenNumElts; ++i) NewMask.push_back(-1); return DAG.getVectorShuffle(WidenVT, dl, InOp1, InOp2, &NewMask[0]); } SDValue DAGTypeLegalizer::WidenVecRes_VSETCC(SDNode *N) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && "Operands must be vectors"); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned WidenNumElts = WidenVT.getVectorNumElements(); SDValue InOp1 = N->getOperand(0); EVT InVT = InOp1.getValueType(); assert(InVT.isVector() && "can not widen non-vector type"); EVT WidenInVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), WidenNumElts); InOp1 = GetWidenedVector(InOp1); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); // Assume that the input and output will be widen appropriately. If not, // we will have to unroll it at some point. assert(InOp1.getValueType() == WidenInVT && InOp2.getValueType() == WidenInVT && "Input not widened to expected type!"); (void)WidenInVT; return DAG.getNode(ISD::SETCC, SDLoc(N), WidenVT, InOp1, InOp2, N->getOperand(2)); } //===----------------------------------------------------------------------===// // Widen Vector Operand //===----------------------------------------------------------------------===// bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { DEBUG(dbgs() << "Widen node operand " << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); // See if the target wants to custom widen this node. if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) return false; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "WidenVectorOperand op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to widen this operator's operand!"); case ISD::BITCAST: Res = WidenVecOp_BITCAST(N); break; case ISD::CONCAT_VECTORS: Res = WidenVecOp_CONCAT_VECTORS(N); break; case ISD::EXTRACT_SUBVECTOR: Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break; case ISD::STORE: Res = WidenVecOp_STORE(N); break; + case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break; case ISD::SETCC: Res = WidenVecOp_SETCC(N); break; case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: Res = WidenVecOp_EXTEND(N); break; case ISD::FP_EXTEND: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::TRUNCATE: Res = WidenVecOp_Convert(N); break; } // If Res is null, the sub-method took care of registering the result. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer // core about this. if (Res.getNode() == N) return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && "Invalid operand expansion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue InOp = N->getOperand(0); // If some legalization strategy other than widening is used on the operand, // we can't safely assume that just extending the low lanes is the correct // transformation. if (getTypeAction(InOp.getValueType()) != TargetLowering::TypeWidenVector) return WidenVecOp_Convert(N); InOp = GetWidenedVector(InOp); assert(VT.getVectorNumElements() < InOp.getValueType().getVectorNumElements() && "Input wasn't widened!"); // We may need to further widen the operand until it has the same total // vector size as the result. EVT InVT = InOp.getValueType(); if (InVT.getSizeInBits() != VT.getSizeInBits()) { EVT InEltVT = InVT.getVectorElementType(); for (int i = MVT::FIRST_VECTOR_VALUETYPE, e = MVT::LAST_VECTOR_VALUETYPE; i < e; ++i) { EVT FixedVT = (MVT::SimpleValueType)i; EVT FixedEltVT = FixedVT.getVectorElementType(); if (TLI.isTypeLegal(FixedVT) && FixedVT.getSizeInBits() == VT.getSizeInBits() && FixedEltVT == InEltVT) { assert(FixedVT.getVectorNumElements() >= VT.getVectorNumElements() && "Not enough elements in the fixed type for the operand!"); assert(FixedVT.getVectorNumElements() != InVT.getVectorNumElements() && "We can't have the same type as we started with!"); if (FixedVT.getVectorNumElements() > InVT.getVectorNumElements()) InOp = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, FixedVT, DAG.getUNDEF(FixedVT), InOp, DAG.getConstant(0, TLI.getVectorIdxTy())); else InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FixedVT, InOp, DAG.getConstant(0, TLI.getVectorIdxTy())); break; } } InVT = InOp.getValueType(); if (InVT.getSizeInBits() != VT.getSizeInBits()) // We couldn't find a legal vector type that was a widening of the input // and could be extended in-register to the result type, so we have to // scalarize. return WidenVecOp_Convert(N); } // Use special DAG nodes to represent the operation of extending the // low lanes. switch (N->getOpcode()) { default: llvm_unreachable("Extend legalization on on extend operation!"); case ISD::ANY_EXTEND: return DAG.getAnyExtendVectorInReg(InOp, DL, VT); case ISD::SIGN_EXTEND: return DAG.getSignExtendVectorInReg(InOp, DL, VT); case ISD::ZERO_EXTEND: return DAG.getZeroExtendVectorInReg(InOp, DL, VT); } } SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { // Since the result is legal and the input is illegal, it is unlikely // that we can fix the input to a legal type so unroll the convert // into some scalar code and create a nasty build vector. EVT VT = N->getValueType(0); EVT EltVT = VT.getVectorElementType(); SDLoc dl(N); unsigned NumElts = VT.getVectorNumElements(); SDValue InOp = N->getOperand(0); if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector) InOp = GetWidenedVector(InOp); EVT InVT = InOp.getValueType(); EVT InEltVT = InVT.getVectorElementType(); unsigned Opcode = N->getOpcode(); SmallVector Ops(NumElts); for (unsigned i=0; i < NumElts; ++i) Ops[i] = DAG.getNode(Opcode, dl, EltVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp, DAG.getConstant(i, TLI.getVectorIdxTy()))); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) { EVT VT = N->getValueType(0); SDValue InOp = GetWidenedVector(N->getOperand(0)); EVT InWidenVT = InOp.getValueType(); SDLoc dl(N); // Check if we can convert between two legal vector types and extract. unsigned InWidenSize = InWidenVT.getSizeInBits(); unsigned Size = VT.getSizeInBits(); // x86mmx is not an acceptable vector element type, so don't try. if (InWidenSize % Size == 0 && !VT.isVector() && VT != MVT::x86mmx) { unsigned NewNumElts = InWidenSize / Size; EVT NewVT = EVT::getVectorVT(*DAG.getContext(), VT, NewNumElts); if (TLI.isTypeLegal(NewVT)) { SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp, DAG.getConstant(0, TLI.getVectorIdxTy())); } } return CreateStackStoreLoad(InOp, VT); } SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) { // If the input vector is not legal, it is likely that we will not find a // legal vector of the same size. Replace the concatenate vector with a // nasty build vector. EVT VT = N->getValueType(0); EVT EltVT = VT.getVectorElementType(); SDLoc dl(N); unsigned NumElts = VT.getVectorNumElements(); SmallVector Ops(NumElts); EVT InVT = N->getOperand(0).getValueType(); unsigned NumInElts = InVT.getVectorNumElements(); unsigned Idx = 0; unsigned NumOperands = N->getNumOperands(); for (unsigned i=0; i < NumOperands; ++i) { SDValue InOp = N->getOperand(i); if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector) InOp = GetWidenedVector(InOp); for (unsigned j=0; j < NumInElts; ++j) Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(j, TLI.getVectorIdxTy())); } return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) { SDValue InOp = GetWidenedVector(N->getOperand(0)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0), InOp, N->getOperand(1)); } SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { SDValue InOp = GetWidenedVector(N->getOperand(0)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), N->getValueType(0), InOp, N->getOperand(1)); } SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) { // We have to widen the value but we want only to store the original // vector type. StoreSDNode *ST = cast(N); SmallVector StChain; if (ST->isTruncatingStore()) GenWidenVectorTruncStores(StChain, ST); else GenWidenVectorStores(StChain, ST); if (StChain.size() == 1) return StChain[0]; else return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain); +} + +SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) { + MaskedStoreSDNode *MST = cast(N); + SDValue Mask = MST->getMask(); + EVT MaskVT = Mask.getValueType(); + SDValue StVal = MST->getValue(); + // Widen the value + SDValue WideVal = GetWidenedVector(StVal); + SDLoc dl(N); + + if (OpNo == 2 || getTypeAction(MaskVT) == TargetLowering::TypeWidenVector) + Mask = GetWidenedVector(Mask); + else { + // The mask should be widened as well + EVT BoolVT = getSetCCResultType(WideVal.getValueType()); + // We can't use ModifyToType() because we should fill the mask with + // zeroes + unsigned WidenNumElts = BoolVT.getVectorNumElements(); + unsigned MaskNumElts = MaskVT.getVectorNumElements(); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SmallVector Ops(NumConcat); + SDValue ZeroVal = DAG.getConstant(0, MaskVT); + Ops[0] = Mask; + for (unsigned i = 1; i != NumConcat; ++i) + Ops[i] = ZeroVal; + + Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops); + } + assert(Mask.getValueType().getVectorNumElements() == + WideVal.getValueType().getVectorNumElements() && + "Mask and data vectors should have the same number of elements"); + return DAG.getMaskedStore(MST->getChain(), dl, WideVal, MST->getBasePtr(), + Mask, MST->getMemoryVT(), MST->getMemOperand(), + false); } SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { SDValue InOp0 = GetWidenedVector(N->getOperand(0)); SDValue InOp1 = GetWidenedVector(N->getOperand(1)); SDLoc dl(N); // WARNING: In this code we widen the compare instruction with garbage. // This garbage may contain denormal floats which may be slow. Is this a real // concern ? Should we zero the unused lanes if this is a float compare ? // Get a new SETCC node to compare the newly widened operands. // Only some of the compared elements are legal. EVT SVT = TLI.getSetCCResultType(*DAG.getContext(), InOp0.getValueType()); SDValue WideSETCC = DAG.getNode(ISD::SETCC, SDLoc(N), SVT, InOp0, InOp1, N->getOperand(2)); // Extract the needed results from the result vector. EVT ResVT = EVT::getVectorVT(*DAG.getContext(), SVT.getVectorElementType(), N->getValueType(0).getVectorNumElements()); SDValue CC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC, DAG.getConstant(0, TLI.getVectorIdxTy())); return PromoteTargetBoolean(CC, N->getValueType(0)); } //===----------------------------------------------------------------------===// // Vector Widening Utilities //===----------------------------------------------------------------------===// // Utility function to find the type to chop up a widen vector for load/store // TLI: Target lowering used to determine legal types. // Width: Width left need to load/store. // WidenVT: The widen vector type to load to/store from // Align: If 0, don't allow use of a wider type // WidenEx: If Align is not 0, the amount additional we can load/store from. static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI, unsigned Width, EVT WidenVT, unsigned Align = 0, unsigned WidenEx = 0) { EVT WidenEltVT = WidenVT.getVectorElementType(); unsigned WidenWidth = WidenVT.getSizeInBits(); unsigned WidenEltWidth = WidenEltVT.getSizeInBits(); unsigned AlignInBits = Align*8; // If we have one element to load/store, return it. EVT RetVT = WidenEltVT; if (Width == WidenEltWidth) return RetVT; // See if there is larger legal integer than the element type to load/store unsigned VT; for (VT = (unsigned)MVT::LAST_INTEGER_VALUETYPE; VT >= (unsigned)MVT::FIRST_INTEGER_VALUETYPE; --VT) { EVT MemVT((MVT::SimpleValueType) VT); unsigned MemVTWidth = MemVT.getSizeInBits(); if (MemVT.getSizeInBits() <= WidenEltWidth) break; if (TLI.isTypeLegal(MemVT) && (WidenWidth % MemVTWidth) == 0 && isPowerOf2_32(WidenWidth / MemVTWidth) && (MemVTWidth <= Width || (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { RetVT = MemVT; break; } } // See if there is a larger vector type to load/store that has the same vector // element type and is evenly divisible with the WidenVT. for (VT = (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT >= (unsigned)MVT::FIRST_VECTOR_VALUETYPE; --VT) { EVT MemVT = (MVT::SimpleValueType) VT; unsigned MemVTWidth = MemVT.getSizeInBits(); if (TLI.isTypeLegal(MemVT) && WidenEltVT == MemVT.getVectorElementType() && (WidenWidth % MemVTWidth) == 0 && isPowerOf2_32(WidenWidth / MemVTWidth) && (MemVTWidth <= Width || (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { if (RetVT.getSizeInBits() < MemVTWidth || MemVT == WidenVT) return MemVT; } } return RetVT; } // Builds a vector type from scalar loads // VecTy: Resulting Vector type // LDOps: Load operators to build a vector type // [Start,End) the list of loads to use. static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy, SmallVectorImpl &LdOps, unsigned Start, unsigned End) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc dl(LdOps[Start]); EVT LdTy = LdOps[Start].getValueType(); unsigned Width = VecTy.getSizeInBits(); unsigned NumElts = Width / LdTy.getSizeInBits(); EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), LdTy, NumElts); unsigned Idx = 1; SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT,LdOps[Start]); for (unsigned i = Start + 1; i != End; ++i) { EVT NewLdTy = LdOps[i].getValueType(); if (NewLdTy != LdTy) { NumElts = Width / NewLdTy.getSizeInBits(); NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewLdTy, NumElts); VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, VecOp); // Readjust position and vector position based on new load type Idx = Idx * LdTy.getSizeInBits() / NewLdTy.getSizeInBits(); LdTy = NewLdTy; } VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i], DAG.getConstant(Idx++, TLI.getVectorIdxTy())); } return DAG.getNode(ISD::BITCAST, dl, VecTy, VecOp); } SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl &LdChain, LoadSDNode *LD) { // The strategy assumes that we can efficiently load powers of two widths. // The routines chops the vector into the largest vector loads with the same // element type or scalar loads and then recombines it to the widen vector // type. EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0)); unsigned WidenWidth = WidenVT.getSizeInBits(); EVT LdVT = LD->getMemoryVT(); SDLoc dl(LD); assert(LdVT.isVector() && WidenVT.isVector()); assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType()); // Load information SDValue Chain = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); unsigned Align = LD->getAlignment(); bool isVolatile = LD->isVolatile(); bool isNonTemporal = LD->isNonTemporal(); bool isInvariant = LD->isInvariant(); AAMDNodes AAInfo = LD->getAAInfo(); int LdWidth = LdVT.getSizeInBits(); int WidthDiff = WidenWidth - LdWidth; // Difference unsigned LdAlign = (isVolatile) ? 0 : Align; // Allow wider loads // Find the vector type that can load from. EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff); int NewVTWidth = NewVT.getSizeInBits(); SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(), isVolatile, isNonTemporal, isInvariant, Align, AAInfo); LdChain.push_back(LdOp.getValue(1)); // Check if we can load the element with one instruction if (LdWidth <= NewVTWidth) { if (!NewVT.isVector()) { unsigned NumElts = WidenWidth / NewVTWidth; EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts); SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp); return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp); } if (NewVT == WidenVT) return LdOp; assert(WidenWidth % NewVTWidth == 0); unsigned NumConcat = WidenWidth / NewVTWidth; SmallVector ConcatOps(NumConcat); SDValue UndefVal = DAG.getUNDEF(NewVT); ConcatOps[0] = LdOp; for (unsigned i = 1; i != NumConcat; ++i) ConcatOps[i] = UndefVal; return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, ConcatOps); } // Load vector by using multiple loads from largest vector to scalar SmallVector LdOps; LdOps.push_back(LdOp); LdWidth -= NewVTWidth; unsigned Offset = 0; while (LdWidth > 0) { unsigned Increment = NewVTWidth / 8; Offset += Increment; BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getConstant(Increment, BasePtr.getValueType())); SDValue L; if (LdWidth < NewVTWidth) { // Our current type we are using is too large, find a better size NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff); NewVTWidth = NewVT.getSizeInBits(); L = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo().getWithOffset(Offset), isVolatile, isNonTemporal, isInvariant, MinAlign(Align, Increment), AAInfo); LdChain.push_back(L.getValue(1)); if (L->getValueType(0).isVector()) { SmallVector Loads; Loads.push_back(L); unsigned size = L->getValueSizeInBits(0); while (size < LdOp->getValueSizeInBits(0)) { Loads.push_back(DAG.getUNDEF(L->getValueType(0))); size += L->getValueSizeInBits(0); } L = DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0), Loads); } } else { L = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo().getWithOffset(Offset), isVolatile, isNonTemporal, isInvariant, MinAlign(Align, Increment), AAInfo); LdChain.push_back(L.getValue(1)); } LdOps.push_back(L); LdWidth -= NewVTWidth; } // Build the vector from the loads operations unsigned End = LdOps.size(); if (!LdOps[0].getValueType().isVector()) // All the loads are scalar loads. return BuildVectorFromScalar(DAG, WidenVT, LdOps, 0, End); // If the load contains vectors, build the vector using concat vector. // All of the vectors used to loads are power of 2 and the scalars load // can be combined to make a power of 2 vector. SmallVector ConcatOps(End); int i = End - 1; int Idx = End; EVT LdTy = LdOps[i].getValueType(); // First combine the scalar loads to a vector if (!LdTy.isVector()) { for (--i; i >= 0; --i) { LdTy = LdOps[i].getValueType(); if (LdTy.isVector()) break; } ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i+1, End); } ConcatOps[--Idx] = LdOps[i]; for (--i; i >= 0; --i) { EVT NewLdTy = LdOps[i].getValueType(); if (NewLdTy != LdTy) { // Create a larger vector ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy, makeArrayRef(&ConcatOps[Idx], End - Idx)); Idx = End - 1; LdTy = NewLdTy; } ConcatOps[--Idx] = LdOps[i]; } if (WidenWidth == LdTy.getSizeInBits()*(End - Idx)) return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, makeArrayRef(&ConcatOps[Idx], End - Idx)); // We need to fill the rest with undefs to build the vector unsigned NumOps = WidenWidth / LdTy.getSizeInBits(); SmallVector WidenOps(NumOps); SDValue UndefVal = DAG.getUNDEF(LdTy); { unsigned i = 0; for (; i != End-Idx; ++i) WidenOps[i] = ConcatOps[Idx+i]; for (; i != NumOps; ++i) WidenOps[i] = UndefVal; } return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, WidenOps); } SDValue DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl &LdChain, LoadSDNode *LD, ISD::LoadExtType ExtType) { // For extension loads, it may not be more efficient to chop up the vector // and then extended it. Instead, we unroll the load and build a new vector. EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0)); EVT LdVT = LD->getMemoryVT(); SDLoc dl(LD); assert(LdVT.isVector() && WidenVT.isVector()); // Load information SDValue Chain = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); unsigned Align = LD->getAlignment(); bool isVolatile = LD->isVolatile(); bool isNonTemporal = LD->isNonTemporal(); bool isInvariant = LD->isInvariant(); AAMDNodes AAInfo = LD->getAAInfo(); EVT EltVT = WidenVT.getVectorElementType(); EVT LdEltVT = LdVT.getVectorElementType(); unsigned NumElts = LdVT.getVectorNumElements(); // Load each element and widen unsigned WidenNumElts = WidenVT.getVectorNumElements(); SmallVector Ops(WidenNumElts); unsigned Increment = LdEltVT.getSizeInBits() / 8; Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, LD->getPointerInfo(), LdEltVT, isVolatile, isNonTemporal, isInvariant, Align, AAInfo); LdChain.push_back(Ops[0].getValue(1)); unsigned i = 0, Offset = Increment; for (i=1; i < NumElts; ++i, Offset += Increment) { SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getConstant(Offset, BasePtr.getValueType())); Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr, LD->getPointerInfo().getWithOffset(Offset), LdEltVT, isVolatile, isNonTemporal, isInvariant, Align, AAInfo); LdChain.push_back(Ops[i].getValue(1)); } // Fill the rest with undefs SDValue UndefVal = DAG.getUNDEF(EltVT); for (; i != WidenNumElts; ++i) Ops[i] = UndefVal; return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); } void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl &StChain, StoreSDNode *ST) { // The strategy assumes that we can efficiently store powers of two widths. // The routines chops the vector into the largest vector stores with the same // element type or scalar stores. SDValue Chain = ST->getChain(); SDValue BasePtr = ST->getBasePtr(); unsigned Align = ST->getAlignment(); bool isVolatile = ST->isVolatile(); bool isNonTemporal = ST->isNonTemporal(); AAMDNodes AAInfo = ST->getAAInfo(); SDValue ValOp = GetWidenedVector(ST->getValue()); SDLoc dl(ST); EVT StVT = ST->getMemoryVT(); unsigned StWidth = StVT.getSizeInBits(); EVT ValVT = ValOp.getValueType(); unsigned ValWidth = ValVT.getSizeInBits(); EVT ValEltVT = ValVT.getVectorElementType(); unsigned ValEltWidth = ValEltVT.getSizeInBits(); assert(StVT.getVectorElementType() == ValEltVT); int Idx = 0; // current index to store unsigned Offset = 0; // offset from base to store while (StWidth != 0) { // Find the largest vector type we can store with EVT NewVT = FindMemType(DAG, TLI, StWidth, ValVT); unsigned NewVTWidth = NewVT.getSizeInBits(); unsigned Increment = NewVTWidth / 8; if (NewVT.isVector()) { unsigned NumVTElts = NewVT.getVectorNumElements(); do { SDValue EOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp, DAG.getConstant(Idx, TLI.getVectorIdxTy())); StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset), isVolatile, isNonTemporal, MinAlign(Align, Offset), AAInfo)); StWidth -= NewVTWidth; Offset += Increment; Idx += NumVTElts; BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getConstant(Increment, BasePtr.getValueType())); } while (StWidth != 0 && StWidth >= NewVTWidth); } else { // Cast the vector to the scalar type we can store unsigned NumElts = ValWidth / NewVTWidth; EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts); SDValue VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, ValOp); // Readjust index position based on new vector type Idx = Idx * ValEltWidth / NewVTWidth; do { SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp, DAG.getConstant(Idx++, TLI.getVectorIdxTy())); StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset), isVolatile, isNonTemporal, MinAlign(Align, Offset), AAInfo)); StWidth -= NewVTWidth; Offset += Increment; BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getConstant(Increment, BasePtr.getValueType())); } while (StWidth != 0 && StWidth >= NewVTWidth); // Restore index back to be relative to the original widen element type Idx = Idx * NewVTWidth / ValEltWidth; } } } void DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl &StChain, StoreSDNode *ST) { // For extension loads, it may not be more efficient to truncate the vector // and then store it. Instead, we extract each element and then store it. SDValue Chain = ST->getChain(); SDValue BasePtr = ST->getBasePtr(); unsigned Align = ST->getAlignment(); bool isVolatile = ST->isVolatile(); bool isNonTemporal = ST->isNonTemporal(); AAMDNodes AAInfo = ST->getAAInfo(); SDValue ValOp = GetWidenedVector(ST->getValue()); SDLoc dl(ST); EVT StVT = ST->getMemoryVT(); EVT ValVT = ValOp.getValueType(); // It must be true that we the widen vector type is bigger than where // we need to store. assert(StVT.isVector() && ValOp.getValueType().isVector()); assert(StVT.bitsLT(ValOp.getValueType())); // For truncating stores, we can not play the tricks of chopping legal // vector types and bit cast it to the right type. Instead, we unroll // the store. EVT StEltVT = StVT.getVectorElementType(); EVT ValEltVT = ValVT.getVectorElementType(); unsigned Increment = ValEltVT.getSizeInBits() / 8; unsigned NumElts = StVT.getVectorNumElements(); SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp, DAG.getConstant(0, TLI.getVectorIdxTy())); StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr, ST->getPointerInfo(), StEltVT, isVolatile, isNonTemporal, Align, AAInfo)); unsigned Offset = Increment; for (unsigned i=1; i < NumElts; ++i, Offset += Increment) { SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getConstant(Offset, BasePtr.getValueType())); SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp, DAG.getConstant(0, TLI.getVectorIdxTy())); StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr, ST->getPointerInfo().getWithOffset(Offset), StEltVT, isVolatile, isNonTemporal, MinAlign(Align, Offset), AAInfo)); } } /// Modifies a vector input (widen or narrows) to a vector of NVT. The /// input vector must have the same element type as NVT. SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) { // Note that InOp might have been widened so it might already have // the right width or it might need be narrowed. EVT InVT = InOp.getValueType(); assert(InVT.getVectorElementType() == NVT.getVectorElementType() && "input and widen element type must match"); SDLoc dl(InOp); // Check if InOp already has the right width. if (InVT == NVT) return InOp; unsigned InNumElts = InVT.getVectorNumElements(); unsigned WidenNumElts = NVT.getVectorNumElements(); if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) { unsigned NumConcat = WidenNumElts / InNumElts; SmallVector Ops(NumConcat); SDValue UndefVal = DAG.getUNDEF(InVT); Ops[0] = InOp; for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = UndefVal; return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops); } if (WidenNumElts < InNumElts && InNumElts % WidenNumElts) return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp, DAG.getConstant(0, TLI.getVectorIdxTy())); // Fall back to extract and build. SmallVector Ops(WidenNumElts); EVT EltVT = NVT.getVectorElementType(); unsigned MinNumElts = std::min(WidenNumElts, InNumElts); unsigned Idx; for (Idx = 0; Idx < MinNumElts; ++Idx) Ops[Idx] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(Idx, TLI.getVectorIdxTy())); SDValue UndefVal = DAG.getUNDEF(EltVT); for ( ; Idx < WidenNumElts; ++Idx) Ops[Idx] = UndefVal; return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); } Index: projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (revision 279025) @@ -1,6863 +1,6864 @@ //===-- SelectionDAG.cpp - Implement the SelectionDAG data structures -----===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This implements the SelectionDAG class. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/SelectionDAG.h" #include "SDNodeDbgValue.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include #include using namespace llvm; /// makeVTList - Return an instance of the SDVTList struct initialized with the /// specified members. static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) { SDVTList Res = {VTs, NumVTs}; return Res; } // Default null implementations of the callbacks. void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode*, SDNode*) {} void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {} //===----------------------------------------------------------------------===// // ConstantFPSDNode Class //===----------------------------------------------------------------------===// /// isExactlyValue - We don't rely on operator== working on double values, as /// it returns true for things that are clearly not equal, like -0.0 and 0.0. /// As such, this method can be used to do an exact bit-for-bit comparison of /// two floating point values. bool ConstantFPSDNode::isExactlyValue(const APFloat& V) const { return getValueAPF().bitwiseIsEqual(V); } bool ConstantFPSDNode::isValueValidForType(EVT VT, const APFloat& Val) { assert(VT.isFloatingPoint() && "Can only convert between FP types"); // convert modifies in place, so make a copy. APFloat Val2 = APFloat(Val); bool losesInfo; (void) Val2.convert(SelectionDAG::EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven, &losesInfo); return !losesInfo; } //===----------------------------------------------------------------------===// // ISD Namespace //===----------------------------------------------------------------------===// /// isBuildVectorAllOnes - Return true if the specified node is a /// BUILD_VECTOR where all of the elements are ~0 or undef. bool ISD::isBuildVectorAllOnes(const SDNode *N) { // Look through a bit convert. while (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); if (N->getOpcode() != ISD::BUILD_VECTOR) return false; unsigned i = 0, e = N->getNumOperands(); // Skip over all of the undef values. while (i != e && N->getOperand(i).getOpcode() == ISD::UNDEF) ++i; // Do not accept an all-undef vector. if (i == e) return false; // Do not accept build_vectors that aren't all constants or which have non-~0 // elements. We have to be a bit careful here, as the type of the constant // may not be the same as the type of the vector elements due to type // legalization (the elements are promoted to a legal type for the target and // a vector of a type may be legal when the base element type is not). // We only want to check enough bits to cover the vector elements, because // we care if the resultant vector is all ones, not whether the individual // constants are. SDValue NotZero = N->getOperand(i); unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits(); if (ConstantSDNode *CN = dyn_cast(NotZero)) { if (CN->getAPIntValue().countTrailingOnes() < EltSize) return false; } else if (ConstantFPSDNode *CFPN = dyn_cast(NotZero)) { if (CFPN->getValueAPF().bitcastToAPInt().countTrailingOnes() < EltSize) return false; } else return false; // Okay, we have at least one ~0 value, check to see if the rest match or are // undefs. Even with the above element type twiddling, this should be OK, as // the same type legalization should have applied to all the elements. for (++i; i != e; ++i) if (N->getOperand(i) != NotZero && N->getOperand(i).getOpcode() != ISD::UNDEF) return false; return true; } /// isBuildVectorAllZeros - Return true if the specified node is a /// BUILD_VECTOR where all of the elements are 0 or undef. bool ISD::isBuildVectorAllZeros(const SDNode *N) { // Look through a bit convert. while (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); if (N->getOpcode() != ISD::BUILD_VECTOR) return false; bool IsAllUndef = true; for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) { if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue; IsAllUndef = false; // Do not accept build_vectors that aren't all constants or which have non-0 // elements. We have to be a bit careful here, as the type of the constant // may not be the same as the type of the vector elements due to type // legalization (the elements are promoted to a legal type for the target // and a vector of a type may be legal when the base element type is not). // We only want to check enough bits to cover the vector elements, because // we care if the resultant vector is all zeros, not whether the individual // constants are. SDValue Zero = N->getOperand(i); unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits(); if (ConstantSDNode *CN = dyn_cast(Zero)) { if (CN->getAPIntValue().countTrailingZeros() < EltSize) return false; } else if (ConstantFPSDNode *CFPN = dyn_cast(Zero)) { if (CFPN->getValueAPF().bitcastToAPInt().countTrailingZeros() < EltSize) return false; } else return false; } // Do not accept an all-undef vector. if (IsAllUndef) return false; return true; } /// \brief Return true if the specified node is a BUILD_VECTOR node of /// all ConstantSDNode or undef. bool ISD::isBuildVectorOfConstantSDNodes(const SDNode *N) { if (N->getOpcode() != ISD::BUILD_VECTOR) return false; for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { SDValue Op = N->getOperand(i); if (Op.getOpcode() == ISD::UNDEF) continue; if (!isa(Op)) return false; } return true; } /// isScalarToVector - Return true if the specified node is a /// ISD::SCALAR_TO_VECTOR node or a BUILD_VECTOR node where only the low /// element is not an undef. bool ISD::isScalarToVector(const SDNode *N) { if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) return true; if (N->getOpcode() != ISD::BUILD_VECTOR) return false; if (N->getOperand(0).getOpcode() == ISD::UNDEF) return false; unsigned NumElems = N->getNumOperands(); if (NumElems == 1) return false; for (unsigned i = 1; i < NumElems; ++i) { SDValue V = N->getOperand(i); if (V.getOpcode() != ISD::UNDEF) return false; } return true; } /// allOperandsUndef - Return true if the node has at least one operand /// and all operands of the specified node are ISD::UNDEF. bool ISD::allOperandsUndef(const SDNode *N) { // Return false if the node has no operands. // This is "logically inconsistent" with the definition of "all" but // is probably the desired behavior. if (N->getNumOperands() == 0) return false; for (unsigned i = 0, e = N->getNumOperands(); i != e ; ++i) if (N->getOperand(i).getOpcode() != ISD::UNDEF) return false; return true; } ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) { switch (ExtType) { case ISD::EXTLOAD: return IsFP ? ISD::FP_EXTEND : ISD::ANY_EXTEND; case ISD::SEXTLOAD: return ISD::SIGN_EXTEND; case ISD::ZEXTLOAD: return ISD::ZERO_EXTEND; default: break; } llvm_unreachable("Invalid LoadExtType"); } /// getSetCCSwappedOperands - Return the operation corresponding to (Y op X) /// when given the operation for (X op Y). ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) { // To perform this operation, we just need to swap the L and G bits of the // operation. unsigned OldL = (Operation >> 2) & 1; unsigned OldG = (Operation >> 1) & 1; return ISD::CondCode((Operation & ~6) | // Keep the N, U, E bits (OldL << 1) | // New G bit (OldG << 2)); // New L bit. } /// getSetCCInverse - Return the operation corresponding to !(X op Y), where /// 'op' is a valid SetCC operation. ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) { unsigned Operation = Op; if (isInteger) Operation ^= 7; // Flip L, G, E bits, but not U. else Operation ^= 15; // Flip all of the condition bits. if (Operation > ISD::SETTRUE2) Operation &= ~8; // Don't let N and U bits get set. return ISD::CondCode(Operation); } /// isSignedOp - For an integer comparison, return 1 if the comparison is a /// signed operation and 2 if the result is an unsigned comparison. Return zero /// if the operation does not depend on the sign of the input (setne and seteq). static int isSignedOp(ISD::CondCode Opcode) { switch (Opcode) { default: llvm_unreachable("Illegal integer setcc operation!"); case ISD::SETEQ: case ISD::SETNE: return 0; case ISD::SETLT: case ISD::SETLE: case ISD::SETGT: case ISD::SETGE: return 1; case ISD::SETULT: case ISD::SETULE: case ISD::SETUGT: case ISD::SETUGE: return 2; } } /// getSetCCOrOperation - Return the result of a logical OR between different /// comparisons of identical values: ((X op1 Y) | (X op2 Y)). This function /// returns SETCC_INVALID if it is not possible to represent the resultant /// comparison. ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2, bool isInteger) { if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) // Cannot fold a signed integer setcc with an unsigned integer setcc. return ISD::SETCC_INVALID; unsigned Op = Op1 | Op2; // Combine all of the condition bits. // If the N and U bits get set then the resultant comparison DOES suddenly // care about orderedness, and is true when ordered. if (Op > ISD::SETTRUE2) Op &= ~16; // Clear the U bit if the N bit is set. // Canonicalize illegal integer setcc's. if (isInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT Op = ISD::SETNE; return ISD::CondCode(Op); } /// getSetCCAndOperation - Return the result of a logical AND between different /// comparisons of identical values: ((X op1 Y) & (X op2 Y)). This /// function returns zero if it is not possible to represent the resultant /// comparison. ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2, bool isInteger) { if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) // Cannot fold a signed setcc with an unsigned setcc. return ISD::SETCC_INVALID; // Combine all of the condition bits. ISD::CondCode Result = ISD::CondCode(Op1 & Op2); // Canonicalize illegal integer setcc's. if (isInteger) { switch (Result) { default: break; case ISD::SETUO : Result = ISD::SETFALSE; break; // SETUGT & SETULT case ISD::SETOEQ: // SETEQ & SETU[LG]E case ISD::SETUEQ: Result = ISD::SETEQ ; break; // SETUGE & SETULE case ISD::SETOLT: Result = ISD::SETULT ; break; // SETULT & SETNE case ISD::SETOGT: Result = ISD::SETUGT ; break; // SETUGT & SETNE } } return Result; } //===----------------------------------------------------------------------===// // SDNode Profile Support //===----------------------------------------------------------------------===// /// AddNodeIDOpcode - Add the node opcode to the NodeID data. /// static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC) { ID.AddInteger(OpC); } /// AddNodeIDValueTypes - Value type lists are intern'd so we can represent them /// solely with their pointer. static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) { ID.AddPointer(VTList.VTs); } /// AddNodeIDOperands - Various routines for adding operands to the NodeID data. /// static void AddNodeIDOperands(FoldingSetNodeID &ID, ArrayRef Ops) { for (auto& Op : Ops) { ID.AddPointer(Op.getNode()); ID.AddInteger(Op.getResNo()); } } /// AddNodeIDOperands - Various routines for adding operands to the NodeID data. /// static void AddNodeIDOperands(FoldingSetNodeID &ID, ArrayRef Ops) { for (auto& Op : Ops) { ID.AddPointer(Op.getNode()); ID.AddInteger(Op.getResNo()); } } static void AddBinaryNodeIDCustom(FoldingSetNodeID &ID, bool nuw, bool nsw, bool exact) { ID.AddBoolean(nuw); ID.AddBoolean(nsw); ID.AddBoolean(exact); } /// AddBinaryNodeIDCustom - Add BinarySDNodes special infos static void AddBinaryNodeIDCustom(FoldingSetNodeID &ID, unsigned Opcode, bool nuw, bool nsw, bool exact) { if (isBinOpWithFlags(Opcode)) AddBinaryNodeIDCustom(ID, nuw, nsw, exact); } static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC, SDVTList VTList, ArrayRef OpList) { AddNodeIDOpcode(ID, OpC); AddNodeIDValueTypes(ID, VTList); AddNodeIDOperands(ID, OpList); } /// AddNodeIDCustom - If this is an SDNode with special info, add this info to /// the NodeID data. static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { switch (N->getOpcode()) { case ISD::TargetExternalSymbol: case ISD::ExternalSymbol: llvm_unreachable("Should only be used on nodes with operands"); default: break; // Normal nodes don't need extra info. case ISD::TargetConstant: case ISD::Constant: { const ConstantSDNode *C = cast(N); ID.AddPointer(C->getConstantIntValue()); ID.AddBoolean(C->isOpaque()); break; } case ISD::TargetConstantFP: case ISD::ConstantFP: { ID.AddPointer(cast(N)->getConstantFPValue()); break; } case ISD::TargetGlobalAddress: case ISD::GlobalAddress: case ISD::TargetGlobalTLSAddress: case ISD::GlobalTLSAddress: { const GlobalAddressSDNode *GA = cast(N); ID.AddPointer(GA->getGlobal()); ID.AddInteger(GA->getOffset()); ID.AddInteger(GA->getTargetFlags()); ID.AddInteger(GA->getAddressSpace()); break; } case ISD::BasicBlock: ID.AddPointer(cast(N)->getBasicBlock()); break; case ISD::Register: ID.AddInteger(cast(N)->getReg()); break; case ISD::RegisterMask: ID.AddPointer(cast(N)->getRegMask()); break; case ISD::SRCVALUE: ID.AddPointer(cast(N)->getValue()); break; case ISD::FrameIndex: case ISD::TargetFrameIndex: ID.AddInteger(cast(N)->getIndex()); break; case ISD::JumpTable: case ISD::TargetJumpTable: ID.AddInteger(cast(N)->getIndex()); ID.AddInteger(cast(N)->getTargetFlags()); break; case ISD::ConstantPool: case ISD::TargetConstantPool: { const ConstantPoolSDNode *CP = cast(N); ID.AddInteger(CP->getAlignment()); ID.AddInteger(CP->getOffset()); if (CP->isMachineConstantPoolEntry()) CP->getMachineCPVal()->addSelectionDAGCSEId(ID); else ID.AddPointer(CP->getConstVal()); ID.AddInteger(CP->getTargetFlags()); break; } case ISD::TargetIndex: { const TargetIndexSDNode *TI = cast(N); ID.AddInteger(TI->getIndex()); ID.AddInteger(TI->getOffset()); ID.AddInteger(TI->getTargetFlags()); break; } case ISD::LOAD: { const LoadSDNode *LD = cast(N); ID.AddInteger(LD->getMemoryVT().getRawBits()); ID.AddInteger(LD->getRawSubclassData()); ID.AddInteger(LD->getPointerInfo().getAddrSpace()); break; } case ISD::STORE: { const StoreSDNode *ST = cast(N); ID.AddInteger(ST->getMemoryVT().getRawBits()); ID.AddInteger(ST->getRawSubclassData()); ID.AddInteger(ST->getPointerInfo().getAddrSpace()); break; } case ISD::SDIV: case ISD::UDIV: case ISD::SRA: case ISD::SRL: case ISD::MUL: case ISD::ADD: case ISD::SUB: case ISD::SHL: { const BinaryWithFlagsSDNode *BinNode = cast(N); AddBinaryNodeIDCustom(ID, N->getOpcode(), BinNode->hasNoUnsignedWrap(), BinNode->hasNoSignedWrap(), BinNode->isExact()); break; } case ISD::ATOMIC_CMP_SWAP: case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: { const AtomicSDNode *AT = cast(N); ID.AddInteger(AT->getMemoryVT().getRawBits()); ID.AddInteger(AT->getRawSubclassData()); ID.AddInteger(AT->getPointerInfo().getAddrSpace()); break; } case ISD::PREFETCH: { const MemSDNode *PF = cast(N); ID.AddInteger(PF->getPointerInfo().getAddrSpace()); break; } case ISD::VECTOR_SHUFFLE: { const ShuffleVectorSDNode *SVN = cast(N); for (unsigned i = 0, e = N->getValueType(0).getVectorNumElements(); i != e; ++i) ID.AddInteger(SVN->getMaskElt(i)); break; } case ISD::TargetBlockAddress: case ISD::BlockAddress: { const BlockAddressSDNode *BA = cast(N); ID.AddPointer(BA->getBlockAddress()); ID.AddInteger(BA->getOffset()); ID.AddInteger(BA->getTargetFlags()); break; } } // end switch (N->getOpcode()) // Target specific memory nodes could also have address spaces to check. if (N->isTargetMemoryOpcode()) ID.AddInteger(cast(N)->getPointerInfo().getAddrSpace()); } /// AddNodeIDNode - Generic routine for adding a nodes info to the NodeID /// data. static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) { AddNodeIDOpcode(ID, N->getOpcode()); // Add the return value info. AddNodeIDValueTypes(ID, N->getVTList()); // Add the operand info. AddNodeIDOperands(ID, N->ops()); // Handle SDNode leafs with special info. AddNodeIDCustom(ID, N); } /// encodeMemSDNodeFlags - Generic routine for computing a value for use in /// the CSE map that carries volatility, temporalness, indexing mode, and /// extension/truncation information. /// static inline unsigned encodeMemSDNodeFlags(int ConvType, ISD::MemIndexedMode AM, bool isVolatile, bool isNonTemporal, bool isInvariant) { assert((ConvType & 3) == ConvType && "ConvType may not require more than 2 bits!"); assert((AM & 7) == AM && "AM may not require more than 3 bits!"); return ConvType | (AM << 2) | (isVolatile << 5) | (isNonTemporal << 6) | (isInvariant << 7); } //===----------------------------------------------------------------------===// // SelectionDAG Class //===----------------------------------------------------------------------===// /// doNotCSE - Return true if CSE should not be performed for this node. static bool doNotCSE(SDNode *N) { if (N->getValueType(0) == MVT::Glue) return true; // Never CSE anything that produces a flag. switch (N->getOpcode()) { default: break; case ISD::HANDLENODE: case ISD::EH_LABEL: return true; // Never CSE these nodes. } // Check that remaining values produced are not flags. for (unsigned i = 1, e = N->getNumValues(); i != e; ++i) if (N->getValueType(i) == MVT::Glue) return true; // Never CSE anything that produces a flag. return false; } /// RemoveDeadNodes - This method deletes all unreachable nodes in the /// SelectionDAG. void SelectionDAG::RemoveDeadNodes() { // Create a dummy node (which is not added to allnodes), that adds a reference // to the root node, preventing it from being deleted. HandleSDNode Dummy(getRoot()); SmallVector DeadNodes; // Add all obviously-dead nodes to the DeadNodes worklist. for (allnodes_iterator I = allnodes_begin(), E = allnodes_end(); I != E; ++I) if (I->use_empty()) DeadNodes.push_back(I); RemoveDeadNodes(DeadNodes); // If the root changed (e.g. it was a dead load, update the root). setRoot(Dummy.getValue()); } /// RemoveDeadNodes - This method deletes the unreachable nodes in the /// given list, and any nodes that become unreachable as a result. void SelectionDAG::RemoveDeadNodes(SmallVectorImpl &DeadNodes) { // Process the worklist, deleting the nodes and adding their uses to the // worklist. while (!DeadNodes.empty()) { SDNode *N = DeadNodes.pop_back_val(); for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) DUL->NodeDeleted(N, nullptr); // Take the node out of the appropriate CSE map. RemoveNodeFromCSEMaps(N); // Next, brutally remove the operand list. This is safe to do, as there are // no cycles in the graph. for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) { SDUse &Use = *I++; SDNode *Operand = Use.getNode(); Use.set(SDValue()); // Now that we removed this operand, see if there are no uses of it left. if (Operand->use_empty()) DeadNodes.push_back(Operand); } DeallocateNode(N); } } void SelectionDAG::RemoveDeadNode(SDNode *N){ SmallVector DeadNodes(1, N); // Create a dummy node that adds a reference to the root node, preventing // it from being deleted. (This matters if the root is an operand of the // dead node.) HandleSDNode Dummy(getRoot()); RemoveDeadNodes(DeadNodes); } void SelectionDAG::DeleteNode(SDNode *N) { // First take this out of the appropriate CSE map. RemoveNodeFromCSEMaps(N); // Finally, remove uses due to operands of this node, remove from the // AllNodes list, and delete the node. DeleteNodeNotInCSEMaps(N); } void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) { assert(N != AllNodes.begin() && "Cannot delete the entry node!"); assert(N->use_empty() && "Cannot delete a node that is not dead!"); // Drop all of the operands and decrement used node's use counts. N->DropOperands(); DeallocateNode(N); } void SDDbgInfo::erase(const SDNode *Node) { DbgValMapType::iterator I = DbgValMap.find(Node); if (I == DbgValMap.end()) return; for (auto &Val: I->second) Val->setIsInvalidated(); DbgValMap.erase(I); } void SelectionDAG::DeallocateNode(SDNode *N) { if (N->OperandsNeedDelete) delete[] N->OperandList; // Set the opcode to DELETED_NODE to help catch bugs when node // memory is reallocated. N->NodeType = ISD::DELETED_NODE; NodeAllocator.Deallocate(AllNodes.remove(N)); // If any of the SDDbgValue nodes refer to this SDNode, invalidate // them and forget about that node. DbgInfo->erase(N); } #ifndef NDEBUG /// VerifySDNode - Sanity check the given SDNode. Aborts if it is invalid. static void VerifySDNode(SDNode *N) { switch (N->getOpcode()) { default: break; case ISD::BUILD_PAIR: { EVT VT = N->getValueType(0); assert(N->getNumValues() == 1 && "Too many results!"); assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) && "Wrong return type!"); assert(N->getNumOperands() == 2 && "Wrong number of operands!"); assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() && "Mismatched operand types!"); assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() && "Wrong operand type!"); assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() && "Wrong return type size"); break; } case ISD::BUILD_VECTOR: { assert(N->getNumValues() == 1 && "Too many results!"); assert(N->getValueType(0).isVector() && "Wrong return type!"); assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() && "Wrong number of operands!"); EVT EltVT = N->getValueType(0).getVectorElementType(); for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) { assert((I->getValueType() == EltVT || (EltVT.isInteger() && I->getValueType().isInteger() && EltVT.bitsLE(I->getValueType()))) && "Wrong operand type!"); assert(I->getValueType() == N->getOperand(0).getValueType() && "Operands must all have the same type"); } break; } } } #endif // NDEBUG /// \brief Insert a newly allocated node into the DAG. /// /// Handles insertion into the all nodes list and CSE map, as well as /// verification and other common operations when a new node is allocated. void SelectionDAG::InsertNode(SDNode *N) { AllNodes.push_back(N); #ifndef NDEBUG VerifySDNode(N); #endif } /// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that /// correspond to it. This is useful when we're about to delete or repurpose /// the node. We don't want future request for structurally identical nodes /// to return N anymore. bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) { bool Erased = false; switch (N->getOpcode()) { case ISD::HANDLENODE: return false; // noop. case ISD::CONDCODE: assert(CondCodeNodes[cast(N)->get()] && "Cond code doesn't exist!"); Erased = CondCodeNodes[cast(N)->get()] != nullptr; CondCodeNodes[cast(N)->get()] = nullptr; break; case ISD::ExternalSymbol: Erased = ExternalSymbols.erase(cast(N)->getSymbol()); break; case ISD::TargetExternalSymbol: { ExternalSymbolSDNode *ESN = cast(N); Erased = TargetExternalSymbols.erase( std::pair(ESN->getSymbol(), ESN->getTargetFlags())); break; } case ISD::VALUETYPE: { EVT VT = cast(N)->getVT(); if (VT.isExtended()) { Erased = ExtendedValueTypeNodes.erase(VT); } else { Erased = ValueTypeNodes[VT.getSimpleVT().SimpleTy] != nullptr; ValueTypeNodes[VT.getSimpleVT().SimpleTy] = nullptr; } break; } default: // Remove it from the CSE Map. assert(N->getOpcode() != ISD::DELETED_NODE && "DELETED_NODE in CSEMap!"); assert(N->getOpcode() != ISD::EntryToken && "EntryToken in CSEMap!"); Erased = CSEMap.RemoveNode(N); break; } #ifndef NDEBUG // Verify that the node was actually in one of the CSE maps, unless it has a // flag result (which cannot be CSE'd) or is one of the special cases that are // not subject to CSE. if (!Erased && N->getValueType(N->getNumValues()-1) != MVT::Glue && !N->isMachineOpcode() && !doNotCSE(N)) { N->dump(this); dbgs() << "\n"; llvm_unreachable("Node is not in map!"); } #endif return Erased; } /// AddModifiedNodeToCSEMaps - The specified node has been removed from the CSE /// maps and modified in place. Add it back to the CSE maps, unless an identical /// node already exists, in which case transfer all its users to the existing /// node. This transfer can potentially trigger recursive merging. /// void SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) { // For node types that aren't CSE'd, just act as if no identical node // already exists. if (!doNotCSE(N)) { SDNode *Existing = CSEMap.GetOrInsertNode(N); if (Existing != N) { // If there was already an existing matching node, use ReplaceAllUsesWith // to replace the dead one with the existing one. This can cause // recursive merging of other unrelated nodes down the line. ReplaceAllUsesWith(N, Existing); // N is now dead. Inform the listeners and delete it. for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) DUL->NodeDeleted(N, Existing); DeleteNodeNotInCSEMaps(N); return; } } // If the node doesn't already exist, we updated it. Inform listeners. for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) DUL->NodeUpdated(N); } /// FindModifiedNodeSlot - Find a slot for the specified node if its operands /// were replaced with those specified. If this node is never memoized, /// return null, otherwise return a pointer to the slot it would take. If a /// node already exists with these operands, the slot will be non-null. SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op, void *&InsertPos) { if (doNotCSE(N)) return nullptr; SDValue Ops[] = { Op }; FoldingSetNodeID ID; AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops); AddNodeIDCustom(ID, N); SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos); return Node; } /// FindModifiedNodeSlot - Find a slot for the specified node if its operands /// were replaced with those specified. If this node is never memoized, /// return null, otherwise return a pointer to the slot it would take. If a /// node already exists with these operands, the slot will be non-null. SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op1, SDValue Op2, void *&InsertPos) { if (doNotCSE(N)) return nullptr; SDValue Ops[] = { Op1, Op2 }; FoldingSetNodeID ID; AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops); AddNodeIDCustom(ID, N); SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos); return Node; } /// FindModifiedNodeSlot - Find a slot for the specified node if its operands /// were replaced with those specified. If this node is never memoized, /// return null, otherwise return a pointer to the slot it would take. If a /// node already exists with these operands, the slot will be non-null. SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef Ops, void *&InsertPos) { if (doNotCSE(N)) return nullptr; FoldingSetNodeID ID; AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops); AddNodeIDCustom(ID, N); SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos); return Node; } /// getEVTAlignment - Compute the default alignment value for the /// given type. /// unsigned SelectionDAG::getEVTAlignment(EVT VT) const { Type *Ty = VT == MVT::iPTR ? PointerType::get(Type::getInt8Ty(*getContext()), 0) : VT.getTypeForEVT(*getContext()); return TLI->getDataLayout()->getABITypeAlignment(Ty); } // EntryNode could meaningfully have debug info if we can find it... SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) : TM(tm), TSI(nullptr), TLI(nullptr), OptLevel(OL), EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)), Root(getEntryNode()), NewNodesMustHaveLegalTypes(false), UpdateListeners(nullptr) { AllNodes.push_back(&EntryNode); DbgInfo = new SDDbgInfo(); } void SelectionDAG::init(MachineFunction &mf) { MF = &mf; TLI = getSubtarget().getTargetLowering(); TSI = getSubtarget().getSelectionDAGInfo(); Context = &mf.getFunction()->getContext(); } SelectionDAG::~SelectionDAG() { assert(!UpdateListeners && "Dangling registered DAGUpdateListeners"); allnodes_clear(); delete DbgInfo; } void SelectionDAG::allnodes_clear() { assert(&*AllNodes.begin() == &EntryNode); AllNodes.remove(AllNodes.begin()); while (!AllNodes.empty()) DeallocateNode(AllNodes.begin()); } BinarySDNode *SelectionDAG::GetBinarySDNode(unsigned Opcode, SDLoc DL, SDVTList VTs, SDValue N1, SDValue N2, bool nuw, bool nsw, bool exact) { if (isBinOpWithFlags(Opcode)) { BinaryWithFlagsSDNode *FN = new (NodeAllocator) BinaryWithFlagsSDNode( Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2); FN->setHasNoUnsignedWrap(nuw); FN->setHasNoSignedWrap(nsw); FN->setIsExact(exact); return FN; } BinarySDNode *N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2); return N; } void SelectionDAG::clear() { allnodes_clear(); OperandAllocator.Reset(); CSEMap.clear(); ExtendedValueTypeNodes.clear(); ExternalSymbols.clear(); TargetExternalSymbols.clear(); std::fill(CondCodeNodes.begin(), CondCodeNodes.end(), static_cast(nullptr)); std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(), static_cast(nullptr)); EntryNode.UseList = nullptr; AllNodes.push_back(&EntryNode); Root = getEntryNode(); DbgInfo->clear(); } SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? getNode(ISD::ANY_EXTEND, DL, VT, Op) : getNode(ISD::TRUNCATE, DL, VT, Op); } SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? getNode(ISD::SIGN_EXTEND, DL, VT, Op) : getNode(ISD::TRUNCATE, DL, VT, Op); } SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? getNode(ISD::ZERO_EXTEND, DL, VT, Op) : getNode(ISD::TRUNCATE, DL, VT, Op); } SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT, EVT OpVT) { if (VT.bitsLE(Op.getValueType())) return getNode(ISD::TRUNCATE, SL, VT, Op); TargetLowering::BooleanContent BType = TLI->getBooleanContents(OpVT); return getNode(TLI->getExtendForContent(BType), SL, VT, Op); } SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, SDLoc DL, EVT VT) { assert(!VT.isVector() && "getZeroExtendInReg should use the vector element type instead of " "the vector type!"); if (Op.getValueType() == VT) return Op; unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits(); APInt Imm = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); return getNode(ISD::AND, DL, Op.getValueType(), Op, getConstant(Imm, Op.getValueType())); } SDValue SelectionDAG::getAnyExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) { assert(VT.isVector() && "This DAG node is restricted to vector types."); assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() && "The sizes of the input and result must match in order to perform the " "extend in-register."); assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() && "The destination vector type must have fewer lanes than the input."); return getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, Op); } SDValue SelectionDAG::getSignExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) { assert(VT.isVector() && "This DAG node is restricted to vector types."); assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() && "The sizes of the input and result must match in order to perform the " "extend in-register."); assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() && "The destination vector type must have fewer lanes than the input."); return getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, Op); } SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) { assert(VT.isVector() && "This DAG node is restricted to vector types."); assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() && "The sizes of the input and result must match in order to perform the " "extend in-register."); assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() && "The destination vector type must have fewer lanes than the input."); return getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Op); } /// getNOT - Create a bitwise NOT operation as (XOR Val, -1). /// SDValue SelectionDAG::getNOT(SDLoc DL, SDValue Val, EVT VT) { EVT EltVT = VT.getScalarType(); SDValue NegOne = getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT); return getNode(ISD::XOR, DL, VT, Val, NegOne); } SDValue SelectionDAG::getLogicalNOT(SDLoc DL, SDValue Val, EVT VT) { EVT EltVT = VT.getScalarType(); SDValue TrueValue; switch (TLI->getBooleanContents(VT)) { case TargetLowering::ZeroOrOneBooleanContent: case TargetLowering::UndefinedBooleanContent: TrueValue = getConstant(1, VT); break; case TargetLowering::ZeroOrNegativeOneBooleanContent: TrueValue = getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT); break; } return getNode(ISD::XOR, DL, VT, Val, TrueValue); } SDValue SelectionDAG::getConstant(uint64_t Val, EVT VT, bool isT, bool isO) { EVT EltVT = VT.getScalarType(); assert((EltVT.getSizeInBits() >= 64 || (uint64_t)((int64_t)Val >> EltVT.getSizeInBits()) + 1 < 2) && "getConstant with a uint64_t value that doesn't fit in the type!"); return getConstant(APInt(EltVT.getSizeInBits(), Val), VT, isT, isO); } SDValue SelectionDAG::getConstant(const APInt &Val, EVT VT, bool isT, bool isO) { return getConstant(*ConstantInt::get(*Context, Val), VT, isT, isO); } SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT, bool isO) { assert(VT.isInteger() && "Cannot create FP integer constant!"); EVT EltVT = VT.getScalarType(); const ConstantInt *Elt = &Val; // In some cases the vector type is legal but the element type is illegal and // needs to be promoted, for example v8i8 on ARM. In this case, promote the // inserted value (the type does not need to match the vector element type). // Any extra bits introduced will be truncated away. if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) == TargetLowering::TypePromoteInteger) { EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT); APInt NewVal = Elt->getValue().zext(EltVT.getSizeInBits()); Elt = ConstantInt::get(*getContext(), NewVal); } // In other cases the element type is illegal and needs to be expanded, for // example v2i64 on MIPS32. In this case, find the nearest legal type, split // the value into n parts and use a vector type with n-times the elements. // Then bitcast to the type requested. // Legalizing constants too early makes the DAGCombiner's job harder so we // only legalize if the DAG tells us we must produce legal types. else if (NewNodesMustHaveLegalTypes && VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) == TargetLowering::TypeExpandInteger) { APInt NewVal = Elt->getValue(); EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT); unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits(); unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits; EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts); // Check the temporary vector is the correct size. If this fails then // getTypeToTransformTo() probably returned a type whose size (in bits) // isn't a power-of-2 factor of the requested type size. assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits()); SmallVector EltParts; for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i) { EltParts.push_back(getConstant(NewVal.lshr(i * ViaEltSizeInBits) .trunc(ViaEltSizeInBits), ViaEltVT, isT, isO)); } // EltParts is currently in little endian order. If we actually want // big-endian order then reverse it now. if (TLI->isBigEndian()) std::reverse(EltParts.begin(), EltParts.end()); // The elements must be reversed when the element order is different // to the endianness of the elements (because the BITCAST is itself a // vector shuffle in this situation). However, we do not need any code to // perform this reversal because getConstant() is producing a vector // splat. // This situation occurs in MIPS MSA. SmallVector Ops; for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) Ops.insert(Ops.end(), EltParts.begin(), EltParts.end()); SDValue Result = getNode(ISD::BITCAST, SDLoc(), VT, getNode(ISD::BUILD_VECTOR, SDLoc(), ViaVecVT, Ops)); return Result; } assert(Elt->getBitWidth() == EltVT.getSizeInBits() && "APInt size does not match type size!"); unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(EltVT), None); ID.AddPointer(Elt); ID.AddBoolean(isO); void *IP = nullptr; SDNode *N = nullptr; if ((N = CSEMap.FindNodeOrInsertPos(ID, IP))) if (!VT.isVector()) return SDValue(N, 0); if (!N) { N = new (NodeAllocator) ConstantSDNode(isT, isO, Elt, EltVT); CSEMap.InsertNode(N, IP); InsertNode(N); } SDValue Result(N, 0); if (VT.isVector()) { SmallVector Ops; Ops.assign(VT.getVectorNumElements(), Result); Result = getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Ops); } return Result; } SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, bool isTarget) { return getConstant(Val, TLI->getPointerTy(), isTarget); } SDValue SelectionDAG::getConstantFP(const APFloat& V, EVT VT, bool isTarget) { return getConstantFP(*ConstantFP::get(*getContext(), V), VT, isTarget); } SDValue SelectionDAG::getConstantFP(const ConstantFP& V, EVT VT, bool isTarget){ assert(VT.isFloatingPoint() && "Cannot create integer FP constant!"); EVT EltVT = VT.getScalarType(); // Do the map lookup using the actual bit pattern for the floating point // value, so that we don't have problems with 0.0 comparing equal to -0.0, and // we don't have issues with SNANs. unsigned Opc = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(EltVT), None); ID.AddPointer(&V); void *IP = nullptr; SDNode *N = nullptr; if ((N = CSEMap.FindNodeOrInsertPos(ID, IP))) if (!VT.isVector()) return SDValue(N, 0); if (!N) { N = new (NodeAllocator) ConstantFPSDNode(isTarget, &V, EltVT); CSEMap.InsertNode(N, IP); InsertNode(N); } SDValue Result(N, 0); if (VT.isVector()) { SmallVector Ops; Ops.assign(VT.getVectorNumElements(), Result); // FIXME SDLoc info might be appropriate here Result = getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Ops); } return Result; } SDValue SelectionDAG::getConstantFP(double Val, EVT VT, bool isTarget) { EVT EltVT = VT.getScalarType(); if (EltVT==MVT::f32) return getConstantFP(APFloat((float)Val), VT, isTarget); else if (EltVT==MVT::f64) return getConstantFP(APFloat(Val), VT, isTarget); else if (EltVT==MVT::f80 || EltVT==MVT::f128 || EltVT==MVT::ppcf128 || EltVT==MVT::f16) { bool ignored; APFloat apf = APFloat(Val); apf.convert(EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven, &ignored); return getConstantFP(apf, VT, isTarget); } else llvm_unreachable("Unsupported type in getConstantFP"); } SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL, EVT VT, int64_t Offset, bool isTargetGA, unsigned char TargetFlags) { assert((TargetFlags == 0 || isTargetGA) && "Cannot set target flags on target-independent globals"); // Truncate (with sign-extension) the offset value to the pointer size. unsigned BitWidth = TLI->getPointerTypeSizeInBits(GV->getType()); if (BitWidth < 64) Offset = SignExtend64(Offset, BitWidth); unsigned Opc; if (GV->isThreadLocal()) Opc = isTargetGA ? ISD::TargetGlobalTLSAddress : ISD::GlobalTLSAddress; else Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddPointer(GV); ID.AddInteger(Offset); ID.AddInteger(TargetFlags); ID.AddInteger(GV->getType()->getAddressSpace()); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) GlobalAddressSDNode(Opc, DL.getIROrder(), DL.getDebugLoc(), GV, VT, Offset, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) { unsigned Opc = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(FI); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) FrameIndexSDNode(FI, VT, isTarget); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget, unsigned char TargetFlags) { assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent jump tables"); unsigned Opc = isTarget ? ISD::TargetJumpTable : ISD::JumpTable; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(JTI); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) JumpTableSDNode(JTI, VT, isTarget, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT, unsigned Alignment, int Offset, bool isTarget, unsigned char TargetFlags) { assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent globals"); if (Alignment == 0) Alignment = TLI->getDataLayout()->getPrefTypeAlignment(C->getType()); unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(Alignment); ID.AddInteger(Offset); ID.AddPointer(C); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset, Alignment, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT, unsigned Alignment, int Offset, bool isTarget, unsigned char TargetFlags) { assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent globals"); if (Alignment == 0) Alignment = TLI->getDataLayout()->getPrefTypeAlignment(C->getType()); unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(Alignment); ID.AddInteger(Offset); C->addSelectionDAGCSEId(ID); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset, Alignment, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset, unsigned char TargetFlags) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::TargetIndex, getVTList(VT), None); ID.AddInteger(Index); ID.AddInteger(Offset); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) TargetIndexSDNode(Index, VT, Offset, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), None); ID.AddPointer(MBB); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) BasicBlockSDNode(MBB); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getValueType(EVT VT) { if (VT.isSimple() && (unsigned)VT.getSimpleVT().SimpleTy >= ValueTypeNodes.size()) ValueTypeNodes.resize(VT.getSimpleVT().SimpleTy+1); SDNode *&N = VT.isExtended() ? ExtendedValueTypeNodes[VT] : ValueTypeNodes[VT.getSimpleVT().SimpleTy]; if (N) return SDValue(N, 0); N = new (NodeAllocator) VTSDNode(VT); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) { SDNode *&N = ExternalSymbols[Sym]; if (N) return SDValue(N, 0); N = new (NodeAllocator) ExternalSymbolSDNode(false, Sym, 0, VT); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT, unsigned char TargetFlags) { SDNode *&N = TargetExternalSymbols[std::pair(Sym, TargetFlags)]; if (N) return SDValue(N, 0); N = new (NodeAllocator) ExternalSymbolSDNode(true, Sym, TargetFlags, VT); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) { if ((unsigned)Cond >= CondCodeNodes.size()) CondCodeNodes.resize(Cond+1); if (!CondCodeNodes[Cond]) { CondCodeSDNode *N = new (NodeAllocator) CondCodeSDNode(Cond); CondCodeNodes[Cond] = N; InsertNode(N); } return SDValue(CondCodeNodes[Cond], 0); } // commuteShuffle - swaps the values of N1 and N2, and swaps all indices in // the shuffle mask M that point at N1 to point at N2, and indices that point // N2 to point at N1. static void commuteShuffle(SDValue &N1, SDValue &N2, SmallVectorImpl &M) { std::swap(N1, N2); int NElts = M.size(); for (int i = 0; i != NElts; ++i) { if (M[i] >= NElts) M[i] -= NElts; else if (M[i] >= 0) M[i] += NElts; } } SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1, SDValue N2, const int *Mask) { assert(VT == N1.getValueType() && VT == N2.getValueType() && "Invalid VECTOR_SHUFFLE"); // Canonicalize shuffle undef, undef -> undef if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() == ISD::UNDEF) return getUNDEF(VT); // Validate that all indices in Mask are within the range of the elements // input to the shuffle. unsigned NElts = VT.getVectorNumElements(); SmallVector MaskVec; for (unsigned i = 0; i != NElts; ++i) { assert(Mask[i] < (int)(NElts * 2) && "Index out of range"); MaskVec.push_back(Mask[i]); } // Canonicalize shuffle v, v -> v, undef if (N1 == N2) { N2 = getUNDEF(VT); for (unsigned i = 0; i != NElts; ++i) if (MaskVec[i] >= (int)NElts) MaskVec[i] -= NElts; } // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask. if (N1.getOpcode() == ISD::UNDEF) commuteShuffle(N1, N2, MaskVec); // Canonicalize all index into lhs, -> shuffle lhs, undef // Canonicalize all index into rhs, -> shuffle rhs, undef bool AllLHS = true, AllRHS = true; bool N2Undef = N2.getOpcode() == ISD::UNDEF; for (unsigned i = 0; i != NElts; ++i) { if (MaskVec[i] >= (int)NElts) { if (N2Undef) MaskVec[i] = -1; else AllLHS = false; } else if (MaskVec[i] >= 0) { AllRHS = false; } } if (AllLHS && AllRHS) return getUNDEF(VT); if (AllLHS && !N2Undef) N2 = getUNDEF(VT); if (AllRHS) { N1 = getUNDEF(VT); commuteShuffle(N1, N2, MaskVec); } // Reset our undef status after accounting for the mask. N2Undef = N2.getOpcode() == ISD::UNDEF; // Re-check whether both sides ended up undef. if (N1.getOpcode() == ISD::UNDEF && N2Undef) return getUNDEF(VT); // If Identity shuffle return that node. bool Identity = true; for (unsigned i = 0; i != NElts; ++i) { if (MaskVec[i] >= 0 && MaskVec[i] != (int)i) Identity = false; } if (Identity && NElts) return N1; // Shuffling a constant splat doesn't change the result. if (N2Undef) { SDValue V = N1; // Look through any bitcasts. We check that these don't change the number // (and size) of elements and just changes their types. while (V.getOpcode() == ISD::BITCAST) V = V->getOperand(0); // A splat should always show up as a build vector node. if (auto *BV = dyn_cast(V)) { BitVector UndefElements; SDValue Splat = BV->getSplatValue(&UndefElements); // If this is a splat of an undef, shuffling it is also undef. if (Splat && Splat.getOpcode() == ISD::UNDEF) return getUNDEF(VT); // We only have a splat which can skip shuffles if there is a splatted // value and no undef lanes rearranged by the shuffle. if (Splat && UndefElements.none()) { // Splat of , return , provided that the // number of elements match or the value splatted is a zero constant. if (V.getValueType().getVectorNumElements() == VT.getVectorNumElements()) return N1; if (auto *C = dyn_cast(Splat)) if (C->isNullValue()) return N1; } } } FoldingSetNodeID ID; SDValue Ops[2] = { N1, N2 }; AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops); for (unsigned i = 0; i != NElts; ++i) ID.AddInteger(MaskVec[i]); void* IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); // Allocate the mask array for the node out of the BumpPtrAllocator, since // SDNode doesn't have access to it. This memory will be "leaked" when // the node is deallocated, but recovered when the NodeAllocator is released. int *MaskAlloc = OperandAllocator.Allocate(NElts); memcpy(MaskAlloc, &MaskVec[0], NElts * sizeof(int)); ShuffleVectorSDNode *N = new (NodeAllocator) ShuffleVectorSDNode(VT, dl.getIROrder(), dl.getDebugLoc(), N1, N2, MaskAlloc); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) { MVT VT = SV.getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); SmallVector MaskVec; for (unsigned i = 0; i != NumElems; ++i) { int Idx = SV.getMaskElt(i); if (Idx >= 0) { if (Idx < (int)NumElems) Idx += NumElems; else Idx -= NumElems; } MaskVec.push_back(Idx); } SDValue Op0 = SV.getOperand(0); SDValue Op1 = SV.getOperand(1); return getVectorShuffle(VT, SDLoc(&SV), Op1, Op0, &MaskVec[0]); } SDValue SelectionDAG::getConvertRndSat(EVT VT, SDLoc dl, SDValue Val, SDValue DTy, SDValue STy, SDValue Rnd, SDValue Sat, ISD::CvtCode Code) { // If the src and dest types are the same and the conversion is between // integer types of the same sign or two floats, no conversion is necessary. if (DTy == STy && (Code == ISD::CVT_UU || Code == ISD::CVT_SS || Code == ISD::CVT_FF)) return Val; FoldingSetNodeID ID; SDValue Ops[] = { Val, DTy, STy, Rnd, Sat }; AddNodeIDNode(ID, ISD::CONVERT_RNDSAT, getVTList(VT), Ops); void* IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); CvtRndSatSDNode *N = new (NodeAllocator) CvtRndSatSDNode(VT, dl.getIROrder(), dl.getDebugLoc(), Ops, Code); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::Register, getVTList(VT), None); ID.AddInteger(RegNo); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) RegisterSDNode(RegNo, VT); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::RegisterMask, getVTList(MVT::Untyped), None); ID.AddPointer(RegMask); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) RegisterMaskSDNode(RegMask); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getEHLabel(SDLoc dl, SDValue Root, MCSymbol *Label) { FoldingSetNodeID ID; SDValue Ops[] = { Root }; AddNodeIDNode(ID, ISD::EH_LABEL, getVTList(MVT::Other), Ops); ID.AddPointer(Label); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) EHLabelSDNode(dl.getIROrder(), dl.getDebugLoc(), Root, Label); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset, bool isTarget, unsigned char TargetFlags) { unsigned Opc = isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddPointer(BA); ID.AddInteger(Offset); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) BlockAddressSDNode(Opc, VT, BA, Offset, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getSrcValue(const Value *V) { assert((!V || V->getType()->isPointerTy()) && "SrcValue is not a pointer?"); FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), None); ID.AddPointer(V); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) SrcValueSDNode(V); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } /// getMDNode - Return an MDNodeSDNode which holds an MDNode. SDValue SelectionDAG::getMDNode(const MDNode *MD) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MDNODE_SDNODE, getVTList(MVT::Other), None); ID.AddPointer(MD); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) MDNodeSDNode(MD); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } /// getAddrSpaceCast - Return an AddrSpaceCastSDNode. SDValue SelectionDAG::getAddrSpaceCast(SDLoc dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS) { SDValue Ops[] = {Ptr}; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::ADDRSPACECAST, getVTList(VT), Ops); ID.AddInteger(SrcAS); ID.AddInteger(DestAS); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) AddrSpaceCastSDNode(dl.getIROrder(), dl.getDebugLoc(), VT, Ptr, SrcAS, DestAS); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } /// getShiftAmountOperand - Return the specified value casted to /// the target's desired shift amount type. SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) { EVT OpTy = Op.getValueType(); EVT ShTy = TLI->getShiftAmountTy(LHSTy); if (OpTy == ShTy || OpTy.isVector()) return Op; ISD::NodeType Opcode = OpTy.bitsGT(ShTy) ? ISD::TRUNCATE : ISD::ZERO_EXTEND; return getNode(Opcode, SDLoc(Op), ShTy, Op); } /// CreateStackTemporary - Create a stack temporary, suitable for holding the /// specified value type. SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) { MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo(); unsigned ByteSize = VT.getStoreSize(); Type *Ty = VT.getTypeForEVT(*getContext()); unsigned StackAlign = std::max((unsigned)TLI->getDataLayout()->getPrefTypeAlignment(Ty), minAlign); int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); return getFrameIndex(FrameIdx, TLI->getPointerTy()); } /// CreateStackTemporary - Create a stack temporary suitable for holding /// either of the specified value types. SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) { unsigned Bytes = std::max(VT1.getStoreSizeInBits(), VT2.getStoreSizeInBits())/8; Type *Ty1 = VT1.getTypeForEVT(*getContext()); Type *Ty2 = VT2.getTypeForEVT(*getContext()); const DataLayout *TD = TLI->getDataLayout(); unsigned Align = std::max(TD->getPrefTypeAlignment(Ty1), TD->getPrefTypeAlignment(Ty2)); MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(Bytes, Align, false); return getFrameIndex(FrameIdx, TLI->getPointerTy()); } SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2, ISD::CondCode Cond, SDLoc dl) { // These setcc operations always fold. switch (Cond) { default: break; case ISD::SETFALSE: case ISD::SETFALSE2: return getConstant(0, VT); case ISD::SETTRUE: case ISD::SETTRUE2: { TargetLowering::BooleanContent Cnt = TLI->getBooleanContents(N1->getValueType(0)); return getConstant( Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, VT); } case ISD::SETOEQ: case ISD::SETOGT: case ISD::SETOGE: case ISD::SETOLT: case ISD::SETOLE: case ISD::SETONE: case ISD::SETO: case ISD::SETUO: case ISD::SETUEQ: case ISD::SETUNE: assert(!N1.getValueType().isInteger() && "Illegal setcc for integer!"); break; } if (ConstantSDNode *N2C = dyn_cast(N2.getNode())) { const APInt &C2 = N2C->getAPIntValue(); if (ConstantSDNode *N1C = dyn_cast(N1.getNode())) { const APInt &C1 = N1C->getAPIntValue(); switch (Cond) { default: llvm_unreachable("Unknown integer setcc!"); case ISD::SETEQ: return getConstant(C1 == C2, VT); case ISD::SETNE: return getConstant(C1 != C2, VT); case ISD::SETULT: return getConstant(C1.ult(C2), VT); case ISD::SETUGT: return getConstant(C1.ugt(C2), VT); case ISD::SETULE: return getConstant(C1.ule(C2), VT); case ISD::SETUGE: return getConstant(C1.uge(C2), VT); case ISD::SETLT: return getConstant(C1.slt(C2), VT); case ISD::SETGT: return getConstant(C1.sgt(C2), VT); case ISD::SETLE: return getConstant(C1.sle(C2), VT); case ISD::SETGE: return getConstant(C1.sge(C2), VT); } } } if (ConstantFPSDNode *N1C = dyn_cast(N1.getNode())) { if (ConstantFPSDNode *N2C = dyn_cast(N2.getNode())) { APFloat::cmpResult R = N1C->getValueAPF().compare(N2C->getValueAPF()); switch (Cond) { default: break; case ISD::SETEQ: if (R==APFloat::cmpUnordered) return getUNDEF(VT); // fall through case ISD::SETOEQ: return getConstant(R==APFloat::cmpEqual, VT); case ISD::SETNE: if (R==APFloat::cmpUnordered) return getUNDEF(VT); // fall through case ISD::SETONE: return getConstant(R==APFloat::cmpGreaterThan || R==APFloat::cmpLessThan, VT); case ISD::SETLT: if (R==APFloat::cmpUnordered) return getUNDEF(VT); // fall through case ISD::SETOLT: return getConstant(R==APFloat::cmpLessThan, VT); case ISD::SETGT: if (R==APFloat::cmpUnordered) return getUNDEF(VT); // fall through case ISD::SETOGT: return getConstant(R==APFloat::cmpGreaterThan, VT); case ISD::SETLE: if (R==APFloat::cmpUnordered) return getUNDEF(VT); // fall through case ISD::SETOLE: return getConstant(R==APFloat::cmpLessThan || R==APFloat::cmpEqual, VT); case ISD::SETGE: if (R==APFloat::cmpUnordered) return getUNDEF(VT); // fall through case ISD::SETOGE: return getConstant(R==APFloat::cmpGreaterThan || R==APFloat::cmpEqual, VT); case ISD::SETO: return getConstant(R!=APFloat::cmpUnordered, VT); case ISD::SETUO: return getConstant(R==APFloat::cmpUnordered, VT); case ISD::SETUEQ: return getConstant(R==APFloat::cmpUnordered || R==APFloat::cmpEqual, VT); case ISD::SETUNE: return getConstant(R!=APFloat::cmpEqual, VT); case ISD::SETULT: return getConstant(R==APFloat::cmpUnordered || R==APFloat::cmpLessThan, VT); case ISD::SETUGT: return getConstant(R==APFloat::cmpGreaterThan || R==APFloat::cmpUnordered, VT); case ISD::SETULE: return getConstant(R!=APFloat::cmpGreaterThan, VT); case ISD::SETUGE: return getConstant(R!=APFloat::cmpLessThan, VT); } } else { // Ensure that the constant occurs on the RHS. ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond); MVT CompVT = N1.getValueType().getSimpleVT(); if (!TLI->isCondCodeLegal(SwappedCond, CompVT)) return SDValue(); return getSetCC(dl, VT, N2, N1, SwappedCond); } } // Could not fold it. return SDValue(); } /// SignBitIsZero - Return true if the sign bit of Op is known to be zero. We /// use this predicate to simplify operations downstream. bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const { // This predicate is not safe for vector operations. if (Op.getValueType().isVector()) return false; unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits(); return MaskedValueIsZero(Op, APInt::getSignBit(BitWidth), Depth); } /// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use /// this predicate to simplify operations downstream. Mask is known to be zero /// for bits that V cannot have. bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth) const { APInt KnownZero, KnownOne; computeKnownBits(Op, KnownZero, KnownOne, Depth); return (KnownZero & Mask) == Mask; } /// Determine which bits of Op are known to be either zero or one and return /// them in the KnownZero/KnownOne bitsets. void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, APInt &KnownOne, unsigned Depth) const { unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits(); KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. if (Depth == 6) return; // Limit search depth. APInt KnownZero2, KnownOne2; switch (Op.getOpcode()) { case ISD::Constant: // We know all of the bits for a constant! KnownOne = cast(Op)->getAPIntValue(); KnownZero = ~KnownOne; break; case ISD::AND: // If either the LHS or the RHS are Zero, the result is zero. computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); // Output known-1 bits are only known if set in both the LHS & RHS. KnownOne &= KnownOne2; // Output known-0 are known to be clear if zero in either the LHS | RHS. KnownZero |= KnownZero2; break; case ISD::OR: computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); // Output known-0 bits are only known if clear in both the LHS & RHS. KnownZero &= KnownZero2; // Output known-1 are known to be set if set in either the LHS | RHS. KnownOne |= KnownOne2; break; case ISD::XOR: { computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); // Output known-0 bits are known if clear or set in both the LHS & RHS. APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2); // Output known-1 are known to be set if set in only one of the LHS, RHS. KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2); KnownZero = KnownZeroOut; break; } case ISD::MUL: { computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); // If low bits are zero in either operand, output low known-0 bits. // Also compute a conserative estimate for high known-0 bits. // More trickiness is possible, but this is sufficient for the // interesting case of alignment computation. KnownOne.clearAllBits(); unsigned TrailZ = KnownZero.countTrailingOnes() + KnownZero2.countTrailingOnes(); unsigned LeadZ = std::max(KnownZero.countLeadingOnes() + KnownZero2.countLeadingOnes(), BitWidth) - BitWidth; TrailZ = std::min(TrailZ, BitWidth); LeadZ = std::min(LeadZ, BitWidth); KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) | APInt::getHighBitsSet(BitWidth, LeadZ); break; } case ISD::UDIV: { // For the purposes of computing leading zeros we can conservatively // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); unsigned LeadZ = KnownZero2.countLeadingOnes(); KnownOne2.clearAllBits(); KnownZero2.clearAllBits(); computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros(); if (RHSUnknownLeadingOnes != BitWidth) LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ); break; } case ISD::SELECT: computeKnownBits(Op.getOperand(2), KnownZero, KnownOne, Depth+1); computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); // Only known if known in both the LHS and RHS. KnownOne &= KnownOne2; KnownZero &= KnownZero2; break; case ISD::SELECT_CC: computeKnownBits(Op.getOperand(3), KnownZero, KnownOne, Depth+1); computeKnownBits(Op.getOperand(2), KnownZero2, KnownOne2, Depth+1); // Only known if known in both the LHS and RHS. KnownOne &= KnownOne2; KnownZero &= KnownZero2; break; case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: if (Op.getResNo() != 1) break; // The boolean result conforms to getBooleanContents. // If we know the result of a setcc has the top bits zero, use this info. // We know that we have an integer-based boolean since these operations // are only available for integer. if (TLI->getBooleanContents(Op.getValueType().isVector(), false) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ISD::SETCC: // If we know the result of a setcc has the top bits zero, use this info. if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ISD::SHL: // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0 if (ConstantSDNode *SA = dyn_cast(Op.getOperand(1))) { unsigned ShAmt = SA->getZExtValue(); // If the shift count is an invalid immediate, don't do anything. if (ShAmt >= BitWidth) break; computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero <<= ShAmt; KnownOne <<= ShAmt; // low bits known zero. KnownZero |= APInt::getLowBitsSet(BitWidth, ShAmt); } break; case ISD::SRL: // (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0 if (ConstantSDNode *SA = dyn_cast(Op.getOperand(1))) { unsigned ShAmt = SA->getZExtValue(); // If the shift count is an invalid immediate, don't do anything. if (ShAmt >= BitWidth) break; computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.lshr(ShAmt); KnownOne = KnownOne.lshr(ShAmt); APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt); KnownZero |= HighBits; // High bits known zero. } break; case ISD::SRA: if (ConstantSDNode *SA = dyn_cast(Op.getOperand(1))) { unsigned ShAmt = SA->getZExtValue(); // If the shift count is an invalid immediate, don't do anything. if (ShAmt >= BitWidth) break; // If any of the demanded bits are produced by the sign extension, we also // demand the input sign bit. APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt); computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.lshr(ShAmt); KnownOne = KnownOne.lshr(ShAmt); // Handle the sign bits. APInt SignBit = APInt::getSignBit(BitWidth); SignBit = SignBit.lshr(ShAmt); // Adjust to where it is now in the mask. if (KnownZero.intersects(SignBit)) { KnownZero |= HighBits; // New bits are known zero. } else if (KnownOne.intersects(SignBit)) { KnownOne |= HighBits; // New bits are known one. } } break; case ISD::SIGN_EXTEND_INREG: { EVT EVT = cast(Op.getOperand(1))->getVT(); unsigned EBits = EVT.getScalarType().getSizeInBits(); // Sign extension. Compute the demanded bits in the result that are not // present in the input. APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits); APInt InSignBit = APInt::getSignBit(EBits); APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, EBits); // If the sign extended bits are demanded, we know that the sign // bit is demanded. InSignBit = InSignBit.zext(BitWidth); if (NewBits.getBoolValue()) InputDemandedBits |= InSignBit; computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownOne &= InputDemandedBits; KnownZero &= InputDemandedBits; // If the sign bit of the input is known set or clear, then we know the // top bits of the result. if (KnownZero.intersects(InSignBit)) { // Input sign bit known clear KnownZero |= NewBits; KnownOne &= ~NewBits; } else if (KnownOne.intersects(InSignBit)) { // Input sign bit known set KnownOne |= NewBits; KnownZero &= ~NewBits; } else { // Input sign bit unknown KnownZero &= ~NewBits; KnownOne &= ~NewBits; } break; } case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: case ISD::CTPOP: { unsigned LowBits = Log2_32(BitWidth)+1; KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); KnownOne.clearAllBits(); break; } case ISD::LOAD: { LoadSDNode *LD = cast(Op); // If this is a ZEXTLoad and we are looking at the loaded value. if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) { EVT VT = LD->getMemoryVT(); unsigned MemBits = VT.getScalarType().getSizeInBits(); KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); } else if (const MDNode *Ranges = LD->getRanges()) { computeKnownBitsFromRangeMetadata(*Ranges, KnownZero); } break; } case ISD::ZERO_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarType().getSizeInBits(); APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits); KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); KnownZero |= NewBits; break; } case ISD::SIGN_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarType().getSizeInBits(); APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits); KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); // Note if the sign bit is known to be zero or one. bool SignBitKnownZero = KnownZero.isNegative(); bool SignBitKnownOne = KnownOne.isNegative(); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); // If the sign bit is known zero or one, the top bits match. if (SignBitKnownZero) KnownZero |= NewBits; else if (SignBitKnownOne) KnownOne |= NewBits; break; } case ISD::ANY_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarType().getSizeInBits(); KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); break; } case ISD::TRUNCATE: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarType().getSizeInBits(); KnownZero = KnownZero.zext(InBits); KnownOne = KnownOne.zext(InBits); computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.trunc(BitWidth); KnownOne = KnownOne.trunc(BitWidth); break; } case ISD::AssertZext: { EVT VT = cast(Op.getOperand(1))->getVT(); APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero |= (~InMask); KnownOne &= (~KnownZero); break; } case ISD::FGETSIGN: // All bits are zero except the low bit. KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ISD::SUB: { if (ConstantSDNode *CLHS = dyn_cast(Op.getOperand(0))) { // We know that the top bits of C-X are clear if X contains less bits // than C (i.e. no wrap-around can happen). For example, 20-X is // positive if we can prove that X is >= 0 and < 16. if (CLHS->getAPIntValue().isNonNegative()) { unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros(); // NLZ can't be BitWidth with no sign bit APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1); computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); // If all of the MaskV bits are known to be zero, then we know the // output top bits are zero, because we now know that the output is // from [0-C]. if ((KnownZero2 & MaskV) == MaskV) { unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros(); // Top bits known zero. KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2); } } } } // fall through case ISD::ADD: case ISD::ADDE: { // Output known-0 bits are known if clear or set in both the low clear bits // common to both LHS & RHS. For example, 8+(X<<3) is known to have the // low 3 bits clear. computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); unsigned KnownZeroOut = KnownZero2.countTrailingOnes(); computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); KnownZeroOut = std::min(KnownZeroOut, KnownZero2.countTrailingOnes()); if (Op.getOpcode() == ISD::ADD) { KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroOut); break; } // With ADDE, a carry bit may be added in, so we can only use this // information if we know (at least) that the low two bits are clear. We // then return to the caller that the low bit is unknown but that other bits // are known zero. if (KnownZeroOut >= 2) // ADDE KnownZero |= APInt::getBitsSet(BitWidth, 1, KnownZeroOut); break; } case ISD::SREM: if (ConstantSDNode *Rem = dyn_cast(Op.getOperand(1))) { const APInt &RA = Rem->getAPIntValue().abs(); if (RA.isPowerOf2()) { APInt LowBits = RA - 1; computeKnownBits(Op.getOperand(0), KnownZero2,KnownOne2,Depth+1); // The low bits of the first operand are unchanged by the srem. KnownZero = KnownZero2 & LowBits; KnownOne = KnownOne2 & LowBits; // If the first operand is non-negative or has all low bits zero, then // the upper bits are all zero. if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits)) KnownZero |= ~LowBits; // If the first operand is negative and not all low bits are zero, then // the upper bits are all one. if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0)) KnownOne |= ~LowBits; assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?"); } } break; case ISD::UREM: { if (ConstantSDNode *Rem = dyn_cast(Op.getOperand(1))) { const APInt &RA = Rem->getAPIntValue(); if (RA.isPowerOf2()) { APInt LowBits = (RA - 1); computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth + 1); // The upper bits are all zero, the lower ones are unchanged. KnownZero = KnownZero2 | ~LowBits; KnownOne = KnownOne2 & LowBits; break; } } // Since the result is less than or equal to either operand, any leading // zero bits in either operand must also exist in the result. computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); uint32_t Leaders = std::max(KnownZero.countLeadingOnes(), KnownZero2.countLeadingOnes()); KnownOne.clearAllBits(); KnownZero = APInt::getHighBitsSet(BitWidth, Leaders); break; } case ISD::FrameIndex: case ISD::TargetFrameIndex: if (unsigned Align = InferPtrAlignment(Op)) { // The low bits are known zero if the pointer is aligned. KnownZero = APInt::getLowBitsSet(BitWidth, Log2_32(Align)); break; } break; default: if (Op.getOpcode() < ISD::BUILTIN_OP_END) break; // Fallthrough case ISD::INTRINSIC_WO_CHAIN: case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_VOID: // Allow the target to implement this method for its nodes. TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth); break; } assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); } /// ComputeNumSignBits - Return the number of times the sign bit of the /// register is replicated into the other bits. We know that at least 1 bit /// is always equal to the sign bit (itself), but other cases can give us /// information. For example, immediately after an "SRA X, 2", we know that /// the top 3 bits are all equal to each other, so we return 3. unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ EVT VT = Op.getValueType(); assert(VT.isInteger() && "Invalid VT!"); unsigned VTBits = VT.getScalarType().getSizeInBits(); unsigned Tmp, Tmp2; unsigned FirstAnswer = 1; if (Depth == 6) return 1; // Limit search depth. switch (Op.getOpcode()) { default: break; case ISD::AssertSext: Tmp = cast(Op.getOperand(1))->getVT().getSizeInBits(); return VTBits-Tmp+1; case ISD::AssertZext: Tmp = cast(Op.getOperand(1))->getVT().getSizeInBits(); return VTBits-Tmp; case ISD::Constant: { const APInt &Val = cast(Op)->getAPIntValue(); return Val.getNumSignBits(); } case ISD::SIGN_EXTEND: Tmp = VTBits-Op.getOperand(0).getValueType().getScalarType().getSizeInBits(); return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp; case ISD::SIGN_EXTEND_INREG: // Max of the input and what this extends. Tmp = cast(Op.getOperand(1))->getVT().getScalarType().getSizeInBits(); Tmp = VTBits-Tmp+1; Tmp2 = ComputeNumSignBits(Op.getOperand(0), Depth+1); return std::max(Tmp, Tmp2); case ISD::SRA: Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); // SRA X, C -> adds C sign bits. if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { Tmp += C->getZExtValue(); if (Tmp > VTBits) Tmp = VTBits; } return Tmp; case ISD::SHL: if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { // shl destroys sign bits. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); if (C->getZExtValue() >= VTBits || // Bad shift. C->getZExtValue() >= Tmp) break; // Shifted all sign bits out. return Tmp - C->getZExtValue(); } break; case ISD::AND: case ISD::OR: case ISD::XOR: // NOT is handled here. // Logical binary ops preserve the number of sign bits at the worst. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp != 1) { Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1); FirstAnswer = std::min(Tmp, Tmp2); // We computed what we know about the sign bits as our first // answer. Now proceed to the generic code that uses // computeKnownBits, and pick whichever answer is better. } break; case ISD::SELECT: Tmp = ComputeNumSignBits(Op.getOperand(1), Depth+1); if (Tmp == 1) return 1; // Early out. Tmp2 = ComputeNumSignBits(Op.getOperand(2), Depth+1); return std::min(Tmp, Tmp2); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: if (Op.getResNo() != 1) break; // The boolean result conforms to getBooleanContents. Fall through. // If setcc returns 0/-1, all bits are sign bits. // We know that we have an integer-based boolean since these operations // are only available for integer. if (TLI->getBooleanContents(Op.getValueType().isVector(), false) == TargetLowering::ZeroOrNegativeOneBooleanContent) return VTBits; break; case ISD::SETCC: // If setcc returns 0/-1, all bits are sign bits. if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == TargetLowering::ZeroOrNegativeOneBooleanContent) return VTBits; break; case ISD::ROTL: case ISD::ROTR: if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { unsigned RotAmt = C->getZExtValue() & (VTBits-1); // Handle rotate right by N like a rotate left by 32-N. if (Op.getOpcode() == ISD::ROTR) RotAmt = (VTBits-RotAmt) & (VTBits-1); // If we aren't rotating out all of the known-in sign bits, return the // number that are left. This handles rotl(sext(x), 1) for example. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp > RotAmt+1) return Tmp-RotAmt; } break; case ISD::ADD: // Add can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp == 1) return 1; // Early out. // Special case decrementing a value (ADD X, -1): if (ConstantSDNode *CRHS = dyn_cast(Op.getOperand(1))) if (CRHS->isAllOnesValue()) { APInt KnownZero, KnownOne; computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue()) return VTBits; // If we are subtracting one from a positive number, there is no carry // out of the result. if (KnownZero.isNegative()) return Tmp; } Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1); if (Tmp2 == 1) return 1; return std::min(Tmp, Tmp2)-1; case ISD::SUB: Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1); if (Tmp2 == 1) return 1; // Handle NEG. if (ConstantSDNode *CLHS = dyn_cast(Op.getOperand(0))) if (CLHS->isNullValue()) { APInt KnownZero, KnownOne; computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue()) return VTBits; // If the input is known to be positive (the sign bit is known clear), // the output of the NEG has the same number of sign bits as the input. if (KnownZero.isNegative()) return Tmp2; // Otherwise, we treat this like a SUB. } // Sub can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp == 1) return 1; // Early out. return std::min(Tmp, Tmp2)-1; case ISD::TRUNCATE: // FIXME: it's tricky to do anything useful for this, but it is an important // case for targets like X86. break; } // If we are looking at the loaded value of the SDNode. if (Op.getResNo() == 0) { // Handle LOADX separately here. EXTLOAD case will fallthrough. if (LoadSDNode *LD = dyn_cast(Op)) { unsigned ExtType = LD->getExtensionType(); switch (ExtType) { default: break; case ISD::SEXTLOAD: // '17' bits known Tmp = LD->getMemoryVT().getScalarType().getSizeInBits(); return VTBits-Tmp+1; case ISD::ZEXTLOAD: // '16' bits known Tmp = LD->getMemoryVT().getScalarType().getSizeInBits(); return VTBits-Tmp; } } } // Allow the target to implement this method for its nodes. if (Op.getOpcode() >= ISD::BUILTIN_OP_END || Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN || Op.getOpcode() == ISD::INTRINSIC_W_CHAIN || Op.getOpcode() == ISD::INTRINSIC_VOID) { unsigned NumBits = TLI->ComputeNumSignBitsForTargetNode(Op, *this, Depth); if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits); } // Finally, if we can prove that the top bits of the result are 0's or 1's, // use this information. APInt KnownZero, KnownOne; computeKnownBits(Op, KnownZero, KnownOne, Depth); APInt Mask; if (KnownZero.isNegative()) { // sign bit is 0 Mask = KnownZero; } else if (KnownOne.isNegative()) { // sign bit is 1; Mask = KnownOne; } else { // Nothing known. return FirstAnswer; } // Okay, we know that the sign bit in Mask is set. Use CLZ to determine // the number of identical bits in the top of the input value. Mask = ~Mask; Mask <<= Mask.getBitWidth()-VTBits; // Return # leading zeros. We use 'min' here in case Val was zero before // shifting. We don't want to return '64' as for an i32 "0". return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros())); } /// isBaseWithConstantOffset - Return true if the specified operand is an /// ISD::ADD with a ConstantSDNode on the right-hand side, or if it is an /// ISD::OR with a ConstantSDNode that is guaranteed to have the same /// semantics as an ADD. This handles the equivalence: /// X|Cst == X+Cst iff X&Cst = 0. bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const { if ((Op.getOpcode() != ISD::ADD && Op.getOpcode() != ISD::OR) || !isa(Op.getOperand(1))) return false; if (Op.getOpcode() == ISD::OR && !MaskedValueIsZero(Op.getOperand(0), cast(Op.getOperand(1))->getAPIntValue())) return false; return true; } bool SelectionDAG::isKnownNeverNaN(SDValue Op) const { // If we're told that NaNs won't happen, assume they won't. if (getTarget().Options.NoNaNsFPMath) return true; // If the value is a constant, we can obviously see if it is a NaN or not. if (const ConstantFPSDNode *C = dyn_cast(Op)) return !C->getValueAPF().isNaN(); // TODO: Recognize more cases here. return false; } bool SelectionDAG::isKnownNeverZero(SDValue Op) const { // If the value is a constant, we can obviously see if it is a zero or not. if (const ConstantFPSDNode *C = dyn_cast(Op)) return !C->isZero(); // TODO: Recognize more cases here. switch (Op.getOpcode()) { default: break; case ISD::OR: if (const ConstantSDNode *C = dyn_cast(Op.getOperand(1))) return !C->isNullValue(); break; } return false; } bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { // Check the obvious case. if (A == B) return true; // For for negative and positive zero. if (const ConstantFPSDNode *CA = dyn_cast(A)) if (const ConstantFPSDNode *CB = dyn_cast(B)) if (CA->isZero() && CB->isZero()) return true; // Otherwise they may not be equal. return false; } /// getNode - Gets or creates the specified node. /// SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, getVTList(VT), None); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), getVTList(VT)); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue Operand) { // Constant fold unary operations with an integer constant operand. Even // opaque constant will be folded, because the folding of unary operations // doesn't create new constants with different values. Nevertheless, the // opaque flag is preserved during folding to prevent future folding with // other constants. if (ConstantSDNode *C = dyn_cast(Operand.getNode())) { const APInt &Val = C->getAPIntValue(); switch (Opcode) { default: break; case ISD::SIGN_EXTEND: return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), VT, C->isTargetOpcode(), C->isOpaque()); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: case ISD::TRUNCATE: return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), VT, C->isTargetOpcode(), C->isOpaque()); case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: { APFloat apf(EVTToAPFloatSemantics(VT), APInt::getNullValue(VT.getSizeInBits())); (void)apf.convertFromAPInt(Val, Opcode==ISD::SINT_TO_FP, APFloat::rmNearestTiesToEven); return getConstantFP(apf, VT); } case ISD::BITCAST: if (VT == MVT::f16 && C->getValueType(0) == MVT::i16) return getConstantFP(APFloat(APFloat::IEEEhalf, Val), VT); if (VT == MVT::f32 && C->getValueType(0) == MVT::i32) return getConstantFP(APFloat(APFloat::IEEEsingle, Val), VT); else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64) return getConstantFP(APFloat(APFloat::IEEEdouble, Val), VT); break; case ISD::BSWAP: return getConstant(Val.byteSwap(), VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTPOP: return getConstant(Val.countPopulation(), VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: return getConstant(Val.countLeadingZeros(), VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return getConstant(Val.countTrailingZeros(), VT, C->isTargetOpcode(), C->isOpaque()); } } // Constant fold unary operations with a floating point constant operand. if (ConstantFPSDNode *C = dyn_cast(Operand.getNode())) { APFloat V = C->getValueAPF(); // make copy switch (Opcode) { case ISD::FNEG: V.changeSign(); return getConstantFP(V, VT); case ISD::FABS: V.clearSign(); return getConstantFP(V, VT); case ISD::FCEIL: { APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive); if (fs == APFloat::opOK || fs == APFloat::opInexact) return getConstantFP(V, VT); break; } case ISD::FTRUNC: { APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero); if (fs == APFloat::opOK || fs == APFloat::opInexact) return getConstantFP(V, VT); break; } case ISD::FFLOOR: { APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative); if (fs == APFloat::opOK || fs == APFloat::opInexact) return getConstantFP(V, VT); break; } case ISD::FP_EXTEND: { bool ignored; // This can return overflow, underflow, or inexact; we don't care. // FIXME need to be more flexible about rounding mode. (void)V.convert(EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven, &ignored); return getConstantFP(V, VT); } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { integerPart x[2]; bool ignored; static_assert(integerPartWidth >= 64, "APFloat parts too small!"); // FIXME need to be more flexible about rounding mode. APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(), Opcode==ISD::FP_TO_SINT, APFloat::rmTowardZero, &ignored); if (s==APFloat::opInvalidOp) // inexact is OK, in fact usual break; APInt api(VT.getSizeInBits(), x); return getConstant(api, VT); } case ISD::BITCAST: if (VT == MVT::i16 && C->getValueType(0) == MVT::f16) return getConstant((uint16_t)V.bitcastToAPInt().getZExtValue(), VT); else if (VT == MVT::i32 && C->getValueType(0) == MVT::f32) return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), VT); else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64) return getConstant(V.bitcastToAPInt().getZExtValue(), VT); break; } } // Constant fold unary operations with a vector integer operand. if (BuildVectorSDNode *BV = dyn_cast(Operand.getNode())) { if (BV->isConstant()) { switch (Opcode) { default: // FIXME: Entirely reasonable to perform folding of other unary // operations here as the need arises. break; case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: { SmallVector Ops; for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { SDValue OpN = BV->getOperand(i); // Let the above scalar folding handle the conversion of each // element. OpN = getNode(ISD::SINT_TO_FP, DL, VT.getVectorElementType(), OpN); Ops.push_back(OpN); } return getNode(ISD::BUILD_VECTOR, DL, VT, Ops); } } } } unsigned OpOpcode = Operand.getNode()->getOpcode(); switch (Opcode) { case ISD::TokenFactor: case ISD::MERGE_VALUES: case ISD::CONCAT_VECTORS: return Operand; // Factor, merge or concat of one node? No need. case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node"); case ISD::FP_EXTEND: assert(VT.isFloatingPoint() && Operand.getValueType().isFloatingPoint() && "Invalid FP cast!"); if (Operand.getValueType() == VT) return Operand; // noop conversion. assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); if (Operand.getOpcode() == ISD::UNDEF) return getUNDEF(VT); break; case ISD::SIGN_EXTEND: assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid SIGN_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) && "Invalid sext node, dst < src!"); assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND) return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); else if (OpOpcode == ISD::UNDEF) // sext(undef) = 0, because the top bits will all be the same. return getConstant(0, VT); break; case ISD::ZERO_EXTEND: assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid ZERO_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) && "Invalid zext node, dst < src!"); assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x) return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getNode()->getOperand(0)); else if (OpOpcode == ISD::UNDEF) // zext(undef) = 0, because the top bits will be zero. return getConstant(0, VT); break; case ISD::ANY_EXTEND: assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid ANY_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) && "Invalid anyext node, dst < src!"); assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x) return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); else if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); // (ext (trunx x)) -> x if (OpOpcode == ISD::TRUNCATE) { SDValue OpOp = Operand.getNode()->getOperand(0); if (OpOp.getValueType() == VT) return OpOp; } break; case ISD::TRUNCATE: assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid TRUNCATE!"); if (Operand.getValueType() == VT) return Operand; // noop truncate assert(Operand.getValueType().getScalarType().bitsGT(VT.getScalarType()) && "Invalid truncate node, src < dst!"); assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); if (OpOpcode == ISD::TRUNCATE) return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0)); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) { // If the source is smaller than the dest, we still need an extend. if (Operand.getNode()->getOperand(0).getValueType().getScalarType() .bitsLT(VT.getScalarType())) return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT)) return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0)); return Operand.getNode()->getOperand(0); } if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; case ISD::BITCAST: // Basic sanity checking. assert(VT.getSizeInBits() == Operand.getValueType().getSizeInBits() && "Cannot BITCAST between types of different sizes!"); if (VT == Operand.getValueType()) return Operand; // noop conversion. if (OpOpcode == ISD::BITCAST) // bitconv(bitconv(x)) -> bitconv(x) return getNode(ISD::BITCAST, DL, VT, Operand.getOperand(0)); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; case ISD::SCALAR_TO_VECTOR: assert(VT.isVector() && !Operand.getValueType().isVector() && (VT.getVectorElementType() == Operand.getValueType() || (VT.getVectorElementType().isInteger() && Operand.getValueType().isInteger() && VT.getVectorElementType().bitsLE(Operand.getValueType()))) && "Illegal SCALAR_TO_VECTOR node!"); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); // scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined. if (OpOpcode == ISD::EXTRACT_VECTOR_ELT && isa(Operand.getOperand(1)) && Operand.getConstantOperandVal(1) == 0 && Operand.getOperand(0).getValueType() == VT) return Operand.getOperand(0); break; case ISD::FNEG: // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB) return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1), Operand.getNode()->getOperand(0)); if (OpOpcode == ISD::FNEG) // --X -> X return Operand.getNode()->getOperand(0); break; case ISD::FABS: if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X) return getNode(ISD::FABS, DL, VT, Operand.getNode()->getOperand(0)); break; } SDNode *N; SDVTList VTs = getVTList(VT); if (VT != MVT::Glue) { // Don't CSE flag producing nodes FoldingSetNodeID ID; SDValue Ops[1] = { Operand }; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Operand); CSEMap.InsertNode(N, IP); } else { N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Operand); } InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, EVT VT, SDNode *Cst1, SDNode *Cst2) { // If the opcode is a target-specific ISD node, there's nothing we can // do here and the operand rules may not line up with the below, so // bail early. if (Opcode >= ISD::BUILTIN_OP_END) return SDValue(); SmallVector, 4> Inputs; SmallVector Outputs; EVT SVT = VT.getScalarType(); ConstantSDNode *Scalar1 = dyn_cast(Cst1); ConstantSDNode *Scalar2 = dyn_cast(Cst2); if (Scalar1 && Scalar2 && (Scalar1->isOpaque() || Scalar2->isOpaque())) return SDValue(); if (Scalar1 && Scalar2) // Scalar instruction. Inputs.push_back(std::make_pair(Scalar1, Scalar2)); else { // For vectors extract each constant element into Inputs so we can constant // fold them individually. BuildVectorSDNode *BV1 = dyn_cast(Cst1); BuildVectorSDNode *BV2 = dyn_cast(Cst2); if (!BV1 || !BV2) return SDValue(); assert(BV1->getNumOperands() == BV2->getNumOperands() && "Out of sync!"); for (unsigned I = 0, E = BV1->getNumOperands(); I != E; ++I) { ConstantSDNode *V1 = dyn_cast(BV1->getOperand(I)); ConstantSDNode *V2 = dyn_cast(BV2->getOperand(I)); if (!V1 || !V2) // Not a constant, bail. return SDValue(); if (V1->isOpaque() || V2->isOpaque()) return SDValue(); // Avoid BUILD_VECTOR nodes that perform implicit truncation. // FIXME: This is valid and could be handled by truncating the APInts. if (V1->getValueType(0) != SVT || V2->getValueType(0) != SVT) return SDValue(); Inputs.push_back(std::make_pair(V1, V2)); } } // We have a number of constant values, constant fold them element by element. for (unsigned I = 0, E = Inputs.size(); I != E; ++I) { const APInt &C1 = Inputs[I].first->getAPIntValue(); const APInt &C2 = Inputs[I].second->getAPIntValue(); switch (Opcode) { case ISD::ADD: Outputs.push_back(getConstant(C1 + C2, SVT)); break; case ISD::SUB: Outputs.push_back(getConstant(C1 - C2, SVT)); break; case ISD::MUL: Outputs.push_back(getConstant(C1 * C2, SVT)); break; case ISD::UDIV: if (!C2.getBoolValue()) return SDValue(); Outputs.push_back(getConstant(C1.udiv(C2), SVT)); break; case ISD::UREM: if (!C2.getBoolValue()) return SDValue(); Outputs.push_back(getConstant(C1.urem(C2), SVT)); break; case ISD::SDIV: if (!C2.getBoolValue()) return SDValue(); Outputs.push_back(getConstant(C1.sdiv(C2), SVT)); break; case ISD::SREM: if (!C2.getBoolValue()) return SDValue(); Outputs.push_back(getConstant(C1.srem(C2), SVT)); break; case ISD::AND: Outputs.push_back(getConstant(C1 & C2, SVT)); break; case ISD::OR: Outputs.push_back(getConstant(C1 | C2, SVT)); break; case ISD::XOR: Outputs.push_back(getConstant(C1 ^ C2, SVT)); break; case ISD::SHL: Outputs.push_back(getConstant(C1 << C2, SVT)); break; case ISD::SRL: Outputs.push_back(getConstant(C1.lshr(C2), SVT)); break; case ISD::SRA: Outputs.push_back(getConstant(C1.ashr(C2), SVT)); break; case ISD::ROTL: Outputs.push_back(getConstant(C1.rotl(C2), SVT)); break; case ISD::ROTR: Outputs.push_back(getConstant(C1.rotr(C2), SVT)); break; default: return SDValue(); } } assert((Scalar1 && Scalar2) || (VT.getVectorNumElements() == Outputs.size() && "Expected a scalar or vector!")); // Handle the scalar case first. if (!VT.isVector()) return Outputs.back(); // We may have a vector type but a scalar result. Create a splat. Outputs.resize(VT.getVectorNumElements(), Outputs.back()); // Build a big vector out of the scalar elements we generated. return getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Outputs); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, bool nuw, bool nsw, bool exact) { ConstantSDNode *N1C = dyn_cast(N1.getNode()); ConstantSDNode *N2C = dyn_cast(N2.getNode()); switch (Opcode) { default: break; case ISD::TokenFactor: assert(VT == MVT::Other && N1.getValueType() == MVT::Other && N2.getValueType() == MVT::Other && "Invalid token factor!"); // Fold trivial token factors. if (N1.getOpcode() == ISD::EntryToken) return N2; if (N2.getOpcode() == ISD::EntryToken) return N1; if (N1 == N2) return N1; break; case ISD::CONCAT_VECTORS: // Concat of UNDEFs is UNDEF. if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() == ISD::UNDEF) return getUNDEF(VT); // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to // one big BUILD_VECTOR. if (N1.getOpcode() == ISD::BUILD_VECTOR && N2.getOpcode() == ISD::BUILD_VECTOR) { SmallVector Elts(N1.getNode()->op_begin(), N1.getNode()->op_end()); Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end()); return getNode(ISD::BUILD_VECTOR, DL, VT, Elts); } break; case ISD::AND: assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); // (X & 0) -> 0. This commonly occurs when legalizing i64 values, so it's // worth handling here. if (N2C && N2C->isNullValue()) return N2; if (N2C && N2C->isAllOnesValue()) // X & -1 -> X return N1; break; case ISD::OR: case ISD::XOR: case ISD::ADD: case ISD::SUB: assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); // (X ^|+- 0) -> X. This commonly occurs when legalizing i64 values, so // it's worth handling here. if (N2C && N2C->isNullValue()) return N1; break; case ISD::UDIV: case ISD::UREM: case ISD::MULHU: case ISD::MULHS: case ISD::MUL: case ISD::SDIV: case ISD::SREM: assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); break; case ISD::FADD: case ISD::FSUB: case ISD::FMUL: case ISD::FDIV: case ISD::FREM: if (getTarget().Options.UnsafeFPMath) { if (Opcode == ISD::FADD) { // 0+x --> x if (ConstantFPSDNode *CFP = dyn_cast(N1)) if (CFP->getValueAPF().isZero()) return N2; // x+0 --> x if (ConstantFPSDNode *CFP = dyn_cast(N2)) if (CFP->getValueAPF().isZero()) return N1; } else if (Opcode == ISD::FSUB) { // x-0 --> x if (ConstantFPSDNode *CFP = dyn_cast(N2)) if (CFP->getValueAPF().isZero()) return N1; } else if (Opcode == ISD::FMUL) { ConstantFPSDNode *CFP = dyn_cast(N1); SDValue V = N2; // If the first operand isn't the constant, try the second if (!CFP) { CFP = dyn_cast(N2); V = N1; } if (CFP) { // 0*x --> 0 if (CFP->isZero()) return SDValue(CFP,0); // 1*x --> x if (CFP->isExactlyValue(1.0)) return V; } } } assert(VT.isFloatingPoint() && "This operator only applies to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); break; case ISD::FCOPYSIGN: // N1 and result must match. N1/N2 need not match. assert(N1.getValueType() == VT && N1.getValueType().isFloatingPoint() && N2.getValueType().isFloatingPoint() && "Invalid FCOPYSIGN!"); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::ROTL: case ISD::ROTR: assert(VT == N1.getValueType() && "Shift operators return type must be the same as their first arg"); assert(VT.isInteger() && N2.getValueType().isInteger() && "Shifts only work on integers"); assert((!VT.isVector() || VT == N2.getValueType()) && "Vector shift amounts must be in the same as their first arg"); // Verify that the shift amount VT is bit enough to hold valid shift // amounts. This catches things like trying to shift an i1024 value by an // i8, which is easy to fall into in generic code that uses // TLI.getShiftAmount(). assert(N2.getValueType().getSizeInBits() >= Log2_32_Ceil(N1.getValueType().getSizeInBits()) && "Invalid use of small shift amount with oversized value!"); // Always fold shifts of i1 values so the code generator doesn't need to // handle them. Since we know the size of the shift has to be less than the // size of the value, the shift/rotate count is guaranteed to be zero. if (VT == MVT::i1) return N1; if (N2C && N2C->isNullValue()) return N1; break; case ISD::FP_ROUND_INREG: { EVT EVT = cast(N2)->getVT(); assert(VT == N1.getValueType() && "Not an inreg round!"); assert(VT.isFloatingPoint() && EVT.isFloatingPoint() && "Cannot FP_ROUND_INREG integer types"); assert(EVT.isVector() == VT.isVector() && "FP_ROUND_INREG type should be vector iff the operand " "type is vector!"); assert((!EVT.isVector() || EVT.getVectorNumElements() == VT.getVectorNumElements()) && "Vector element counts must match in FP_ROUND_INREG"); assert(EVT.bitsLE(VT) && "Not rounding down!"); (void)EVT; if (cast(N2)->getVT() == VT) return N1; // Not actually rounding. break; } case ISD::FP_ROUND: assert(VT.isFloatingPoint() && N1.getValueType().isFloatingPoint() && VT.bitsLE(N1.getValueType()) && isa(N2) && "Invalid FP_ROUND!"); if (N1.getValueType() == VT) return N1; // noop conversion. break; case ISD::AssertSext: case ISD::AssertZext: { EVT EVT = cast(N2)->getVT(); assert(VT == N1.getValueType() && "Not an inreg extend!"); assert(VT.isInteger() && EVT.isInteger() && "Cannot *_EXTEND_INREG FP types"); assert(!EVT.isVector() && "AssertSExt/AssertZExt type should be the vector element type " "rather than the vector type!"); assert(EVT.bitsLE(VT) && "Not extending!"); if (VT == EVT) return N1; // noop assertion. break; } case ISD::SIGN_EXTEND_INREG: { EVT EVT = cast(N2)->getVT(); assert(VT == N1.getValueType() && "Not an inreg extend!"); assert(VT.isInteger() && EVT.isInteger() && "Cannot *_EXTEND_INREG FP types"); assert(EVT.isVector() == VT.isVector() && "SIGN_EXTEND_INREG type should be vector iff the operand " "type is vector!"); assert((!EVT.isVector() || EVT.getVectorNumElements() == VT.getVectorNumElements()) && "Vector element counts must match in SIGN_EXTEND_INREG"); assert(EVT.bitsLE(VT) && "Not extending!"); if (EVT == VT) return N1; // Not actually extending if (N1C) { APInt Val = N1C->getAPIntValue(); unsigned FromBits = EVT.getScalarType().getSizeInBits(); Val <<= Val.getBitWidth()-FromBits; Val = Val.ashr(Val.getBitWidth()-FromBits); return getConstant(Val, VT); } break; } case ISD::EXTRACT_VECTOR_ELT: // EXTRACT_VECTOR_ELT of an UNDEF is an UNDEF. if (N1.getOpcode() == ISD::UNDEF) return getUNDEF(VT); // EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is // expanding copies of large vectors from registers. if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS && N1.getNumOperands() > 0) { unsigned Factor = N1.getOperand(0).getValueType().getVectorNumElements(); return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(N2C->getZExtValue() / Factor), getConstant(N2C->getZExtValue() % Factor, N2.getValueType())); } // EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is // expanding large vector constants. if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) { SDValue Elt = N1.getOperand(N2C->getZExtValue()); if (VT != Elt.getValueType()) // If the vector element type is not legal, the BUILD_VECTOR operands // are promoted and implicitly truncated, and the result implicitly // extended. Make that explicit here. Elt = getAnyExtOrTrunc(Elt, DL, VT); return Elt; } // EXTRACT_VECTOR_ELT of INSERT_VECTOR_ELT is often formed when vector // operations are lowered to scalars. if (N1.getOpcode() == ISD::INSERT_VECTOR_ELT) { // If the indices are the same, return the inserted element else // if the indices are known different, extract the element from // the original vector. SDValue N1Op2 = N1.getOperand(2); ConstantSDNode *N1Op2C = dyn_cast(N1Op2.getNode()); if (N1Op2C && N2C) { if (N1Op2C->getZExtValue() == N2C->getZExtValue()) { if (VT == N1.getOperand(1).getValueType()) return N1.getOperand(1); else return getSExtOrTrunc(N1.getOperand(1), DL, VT); } return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2); } } break; case ISD::EXTRACT_ELEMENT: assert(N2C && (unsigned)N2C->getZExtValue() < 2 && "Bad EXTRACT_ELEMENT!"); assert(!N1.getValueType().isVector() && !VT.isVector() && (N1.getValueType().isInteger() == VT.isInteger()) && N1.getValueType() != VT && "Wrong types for EXTRACT_ELEMENT!"); // EXTRACT_ELEMENT of BUILD_PAIR is often formed while legalize is expanding // 64-bit integers into 32-bit parts. Instead of building the extract of // the BUILD_PAIR, only to have legalize rip it apart, just do it now. if (N1.getOpcode() == ISD::BUILD_PAIR) return N1.getOperand(N2C->getZExtValue()); // EXTRACT_ELEMENT of a constant int is also very common. if (ConstantSDNode *C = dyn_cast(N1)) { unsigned ElementSize = VT.getSizeInBits(); unsigned Shift = ElementSize * N2C->getZExtValue(); APInt ShiftedVal = C->getAPIntValue().lshr(Shift); return getConstant(ShiftedVal.trunc(ElementSize), VT); } break; case ISD::EXTRACT_SUBVECTOR: { SDValue Index = N2; if (VT.isSimple() && N1.getValueType().isSimple()) { assert(VT.isVector() && N1.getValueType().isVector() && "Extract subvector VTs must be a vectors!"); assert(VT.getVectorElementType() == N1.getValueType().getVectorElementType() && "Extract subvector VTs must have the same element type!"); assert(VT.getSimpleVT() <= N1.getSimpleValueType() && "Extract subvector must be from larger vector to smaller vector!"); if (isa(Index.getNode())) { assert((VT.getVectorNumElements() + cast(Index.getNode())->getZExtValue() <= N1.getValueType().getVectorNumElements()) && "Extract subvector overflow!"); } // Trivial extraction. if (VT.getSimpleVT() == N1.getSimpleValueType()) return N1; } break; } } // Perform trivial constant folding. if (SDValue SV = FoldConstantArithmetic(Opcode, VT, N1.getNode(), N2.getNode())) return SV; // Canonicalize constant to RHS if commutative. if (N1C && !N2C && isCommutativeBinOp(Opcode)) { std::swap(N1C, N2C); std::swap(N1, N2); } // Constant fold FP operations. bool HasFPExceptions = TLI->hasFloatingPointExceptions(); ConstantFPSDNode *N1CFP = dyn_cast(N1.getNode()); ConstantFPSDNode *N2CFP = dyn_cast(N2.getNode()); if (N1CFP) { if (!N2CFP && isCommutativeBinOp(Opcode)) { // Canonicalize constant to RHS if commutative. std::swap(N1CFP, N2CFP); std::swap(N1, N2); } else if (N2CFP) { APFloat V1 = N1CFP->getValueAPF(), V2 = N2CFP->getValueAPF(); APFloat::opStatus s; switch (Opcode) { case ISD::FADD: s = V1.add(V2, APFloat::rmNearestTiesToEven); if (!HasFPExceptions || s != APFloat::opInvalidOp) return getConstantFP(V1, VT); break; case ISD::FSUB: s = V1.subtract(V2, APFloat::rmNearestTiesToEven); if (!HasFPExceptions || s!=APFloat::opInvalidOp) return getConstantFP(V1, VT); break; case ISD::FMUL: s = V1.multiply(V2, APFloat::rmNearestTiesToEven); if (!HasFPExceptions || s!=APFloat::opInvalidOp) return getConstantFP(V1, VT); break; case ISD::FDIV: s = V1.divide(V2, APFloat::rmNearestTiesToEven); if (!HasFPExceptions || (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero)) { return getConstantFP(V1, VT); } break; case ISD::FREM : s = V1.mod(V2, APFloat::rmNearestTiesToEven); if (!HasFPExceptions || (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero)) { return getConstantFP(V1, VT); } break; case ISD::FCOPYSIGN: V1.copySign(V2); return getConstantFP(V1, VT); default: break; } } if (Opcode == ISD::FP_ROUND) { APFloat V = N1CFP->getValueAPF(); // make copy bool ignored; // This can return overflow, underflow, or inexact; we don't care. // FIXME need to be more flexible about rounding mode. (void)V.convert(EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven, &ignored); return getConstantFP(V, VT); } } // Canonicalize an UNDEF to the RHS, even over a constant. if (N1.getOpcode() == ISD::UNDEF) { if (isCommutativeBinOp(Opcode)) { std::swap(N1, N2); } else { switch (Opcode) { case ISD::FP_ROUND_INREG: case ISD::SIGN_EXTEND_INREG: case ISD::SUB: case ISD::FSUB: case ISD::FDIV: case ISD::FREM: case ISD::SRA: return N1; // fold op(undef, arg2) -> undef case ISD::UDIV: case ISD::SDIV: case ISD::UREM: case ISD::SREM: case ISD::SRL: case ISD::SHL: if (!VT.isVector()) return getConstant(0, VT); // fold op(undef, arg2) -> 0 // For vectors, we can't easily build an all zero vector, just return // the LHS. return N2; } } } // Fold a bunch of operators when the RHS is undef. if (N2.getOpcode() == ISD::UNDEF) { switch (Opcode) { case ISD::XOR: if (N1.getOpcode() == ISD::UNDEF) // Handle undef ^ undef -> 0 special case. This is a common // idiom (misuse). return getConstant(0, VT); // fallthrough case ISD::ADD: case ISD::ADDC: case ISD::ADDE: case ISD::SUB: case ISD::UDIV: case ISD::SDIV: case ISD::UREM: case ISD::SREM: return N2; // fold op(arg1, undef) -> undef case ISD::FADD: case ISD::FSUB: case ISD::FMUL: case ISD::FDIV: case ISD::FREM: if (getTarget().Options.UnsafeFPMath) return N2; break; case ISD::MUL: case ISD::AND: case ISD::SRL: case ISD::SHL: if (!VT.isVector()) return getConstant(0, VT); // fold op(arg1, undef) -> 0 // For vectors, we can't easily build an all zero vector, just return // the LHS. return N1; case ISD::OR: if (!VT.isVector()) return getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT); // For vectors, we can't easily build an all one vector, just return // the LHS. return N1; case ISD::SRA: return N1; } } // Memoize this node if possible. BinarySDNode *N; SDVTList VTs = getVTList(VT); const bool BinOpHasFlags = isBinOpWithFlags(Opcode); if (VT != MVT::Glue) { SDValue Ops[] = {N1, N2}; FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); if (BinOpHasFlags) AddBinaryNodeIDCustom(ID, Opcode, nuw, nsw, exact); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, nuw, nsw, exact); CSEMap.InsertNode(N, IP); } else { N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, nuw, nsw, exact); } InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3) { // Perform various simplifications. ConstantSDNode *N1C = dyn_cast(N1.getNode()); switch (Opcode) { case ISD::FMA: { ConstantFPSDNode *N1CFP = dyn_cast(N1); ConstantFPSDNode *N2CFP = dyn_cast(N2); ConstantFPSDNode *N3CFP = dyn_cast(N3); if (N1CFP && N2CFP && N3CFP) { APFloat V1 = N1CFP->getValueAPF(); const APFloat &V2 = N2CFP->getValueAPF(); const APFloat &V3 = N3CFP->getValueAPF(); APFloat::opStatus s = V1.fusedMultiplyAdd(V2, V3, APFloat::rmNearestTiesToEven); if (s != APFloat::opInvalidOp) return getConstantFP(V1, VT); } break; } case ISD::CONCAT_VECTORS: // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to // one big BUILD_VECTOR. if (N1.getOpcode() == ISD::BUILD_VECTOR && N2.getOpcode() == ISD::BUILD_VECTOR && N3.getOpcode() == ISD::BUILD_VECTOR) { SmallVector Elts(N1.getNode()->op_begin(), N1.getNode()->op_end()); Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end()); Elts.append(N3.getNode()->op_begin(), N3.getNode()->op_end()); return getNode(ISD::BUILD_VECTOR, DL, VT, Elts); } break; case ISD::SETCC: { // Use FoldSetCC to simplify SETCC's. SDValue Simp = FoldSetCC(VT, N1, N2, cast(N3)->get(), DL); if (Simp.getNode()) return Simp; break; } case ISD::SELECT: if (N1C) { if (N1C->getZExtValue()) return N2; // select true, X, Y -> X return N3; // select false, X, Y -> Y } if (N2 == N3) return N2; // select C, X, X -> X break; case ISD::VECTOR_SHUFFLE: llvm_unreachable("should use getVectorShuffle constructor!"); case ISD::INSERT_SUBVECTOR: { SDValue Index = N3; if (VT.isSimple() && N1.getValueType().isSimple() && N2.getValueType().isSimple()) { assert(VT.isVector() && N1.getValueType().isVector() && N2.getValueType().isVector() && "Insert subvector VTs must be a vectors"); assert(VT == N1.getValueType() && "Dest and insert subvector source types must match!"); assert(N2.getSimpleValueType() <= N1.getSimpleValueType() && "Insert subvector must be from smaller vector to larger vector!"); if (isa(Index.getNode())) { assert((N2.getValueType().getVectorNumElements() + cast(Index.getNode())->getZExtValue() <= VT.getVectorNumElements()) && "Insert subvector overflow!"); } // Trivial insertion. if (VT.getSimpleVT() == N2.getSimpleValueType()) return N2; } break; } case ISD::BITCAST: // Fold bit_convert nodes from a type to themselves. if (N1.getValueType() == VT) return N1; break; } // Memoize node if it doesn't produce a flag. SDNode *N; SDVTList VTs = getVTList(VT); if (VT != MVT::Glue) { SDValue Ops[] = { N1, N2, N3 }; FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2, N3); CSEMap.InsertNode(N, IP); } else { N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2, N3); } InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3, SDValue N4) { SDValue Ops[] = { N1, N2, N3, N4 }; return getNode(Opcode, DL, VT, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3, SDValue N4, SDValue N5) { SDValue Ops[] = { N1, N2, N3, N4, N5 }; return getNode(Opcode, DL, VT, Ops); } /// getStackArgumentTokenFactor - Compute a TokenFactor to force all /// the incoming stack arguments to be loaded from the stack. SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) { SmallVector ArgChains; // Include the original chain at the beginning of the list. When this is // used by target LowerCall hooks, this helps legalize find the // CALLSEQ_BEGIN node. ArgChains.push_back(Chain); // Add a chain value for each stack argument. for (SDNode::use_iterator U = getEntryNode().getNode()->use_begin(), UE = getEntryNode().getNode()->use_end(); U != UE; ++U) if (LoadSDNode *L = dyn_cast(*U)) if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) if (FI->getIndex() < 0) ArgChains.push_back(SDValue(L, 1)); // Build a tokenfactor for all the chains. return getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); } /// getMemsetValue - Vectorized representation of the memset value /// operand. static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG, SDLoc dl) { assert(Value.getOpcode() != ISD::UNDEF); unsigned NumBits = VT.getScalarType().getSizeInBits(); if (ConstantSDNode *C = dyn_cast(Value)) { assert(C->getAPIntValue().getBitWidth() == 8); APInt Val = APInt::getSplat(NumBits, C->getAPIntValue()); if (VT.isInteger()) return DAG.getConstant(Val, VT); return DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(VT), Val), VT); } Value = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Value); if (NumBits > 8) { // Use a multiplication with 0x010101... to extend the input to the // required length. APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01)); Value = DAG.getNode(ISD::MUL, dl, VT, Value, DAG.getConstant(Magic, VT)); } return Value; } /// getMemsetStringVal - Similar to getMemsetValue. Except this is only /// used when a memcpy is turned into a memset when the source is a constant /// string ptr. static SDValue getMemsetStringVal(EVT VT, SDLoc dl, SelectionDAG &DAG, const TargetLowering &TLI, StringRef Str) { // Handle vector with all elements zero. if (Str.empty()) { if (VT.isInteger()) return DAG.getConstant(0, VT); else if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128) return DAG.getConstantFP(0.0, VT); else if (VT.isVector()) { unsigned NumElts = VT.getVectorNumElements(); MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getConstant(0, EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts))); } else llvm_unreachable("Expected type!"); } assert(!VT.isVector() && "Can't handle vector type here!"); unsigned NumVTBits = VT.getSizeInBits(); unsigned NumVTBytes = NumVTBits / 8; unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size())); APInt Val(NumVTBits, 0); if (TLI.isLittleEndian()) { for (unsigned i = 0; i != NumBytes; ++i) Val |= (uint64_t)(unsigned char)Str[i] << i*8; } else { for (unsigned i = 0; i != NumBytes; ++i) Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8; } // If the "cost" of materializing the integer immediate is less than the cost // of a load, then it is cost effective to turn the load into the immediate. Type *Ty = VT.getTypeForEVT(*DAG.getContext()); if (TLI.shouldConvertConstantLoadToIntImm(Val, Ty)) return DAG.getConstant(Val, VT); return SDValue(nullptr, 0); } /// getMemBasePlusOffset - Returns base and offset node for the /// static SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset, SDLoc dl, SelectionDAG &DAG) { EVT VT = Base.getValueType(); return DAG.getNode(ISD::ADD, dl, VT, Base, DAG.getConstant(Offset, VT)); } /// isMemSrcFromString - Returns true if memcpy source is a string constant. /// static bool isMemSrcFromString(SDValue Src, StringRef &Str) { unsigned SrcDelta = 0; GlobalAddressSDNode *G = nullptr; if (Src.getOpcode() == ISD::GlobalAddress) G = cast(Src); else if (Src.getOpcode() == ISD::ADD && Src.getOperand(0).getOpcode() == ISD::GlobalAddress && Src.getOperand(1).getOpcode() == ISD::Constant) { G = cast(Src.getOperand(0)); SrcDelta = cast(Src.getOperand(1))->getZExtValue(); } if (!G) return false; return getConstantStringInfo(G->getGlobal(), Str, SrcDelta, false); } /// FindOptimalMemOpLowering - Determines the optimial series memory ops /// to replace the memset / memcpy. Return true if the number of memory ops /// is below the threshold. It returns the types of the sequence of /// memory ops to perform memset / memcpy by reference. static bool FindOptimalMemOpLowering(std::vector &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, bool AllowOverlap, SelectionDAG &DAG, const TargetLowering &TLI) { assert((SrcAlign == 0 || SrcAlign >= DstAlign) && "Expecting memcpy / memset source to meet alignment requirement!"); // If 'SrcAlign' is zero, that means the memory operation does not need to // load the value, i.e. memset or memcpy from constant string. Otherwise, // it's the inferred alignment of the source. 'DstAlign', on the other hand, // is the specified alignment of the memory operation. If it is zero, that // means it's possible to change the alignment of the destination. // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does // not need to be loaded. EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign, IsMemset, ZeroMemset, MemcpyStrSrc, DAG.getMachineFunction()); if (VT == MVT::Other) { unsigned AS = 0; if (DstAlign >= TLI.getDataLayout()->getPointerPrefAlignment(AS) || TLI.allowsMisalignedMemoryAccesses(VT, AS, DstAlign)) { VT = TLI.getPointerTy(); } else { switch (DstAlign & 7) { case 0: VT = MVT::i64; break; case 4: VT = MVT::i32; break; case 2: VT = MVT::i16; break; default: VT = MVT::i8; break; } } MVT LVT = MVT::i64; while (!TLI.isTypeLegal(LVT)) LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1); assert(LVT.isInteger()); if (VT.bitsGT(LVT)) VT = LVT; } unsigned NumMemOps = 0; while (Size != 0) { unsigned VTSize = VT.getSizeInBits() / 8; while (VTSize > Size) { // For now, only use non-vector load / store's for the left-over pieces. EVT NewVT = VT; unsigned NewVTSize; bool Found = false; if (VT.isVector() || VT.isFloatingPoint()) { NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32; if (TLI.isOperationLegalOrCustom(ISD::STORE, NewVT) && TLI.isSafeMemOpType(NewVT.getSimpleVT())) Found = true; else if (NewVT == MVT::i64 && TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64) && TLI.isSafeMemOpType(MVT::f64)) { // i64 is usually not legal on 32-bit targets, but f64 may be. NewVT = MVT::f64; Found = true; } } if (!Found) { do { NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1); if (NewVT == MVT::i8) break; } while (!TLI.isSafeMemOpType(NewVT.getSimpleVT())); } NewVTSize = NewVT.getSizeInBits() / 8; // If the new VT cannot cover all of the remaining bits, then consider // issuing a (or a pair of) unaligned and overlapping load / store. // FIXME: Only does this for 64-bit or more since we don't have proper // cost model for unaligned load / store. bool Fast; unsigned AS = 0; if (NumMemOps && AllowOverlap && VTSize >= 8 && NewVTSize < Size && TLI.allowsMisalignedMemoryAccesses(VT, AS, DstAlign, &Fast) && Fast) VTSize = Size; else { VT = NewVT; VTSize = NewVTSize; } } if (++NumMemOps > Limit) return false; MemOps.push_back(VT); Size -= VTSize; } return true; } static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, unsigned Align, bool isVol, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { // Turn a memcpy of undef to nop. if (Src.getOpcode() == ISD::UNDEF) return Chain; // Expand memcpy to a series of load and store ops if the size operand falls // below a certain threshold. // TODO: In the AlwaysInline case, if the size is big then generate a loop // rather than maybe a humongous number of loads and stores. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); std::vector MemOps; bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); bool OptSize = MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; unsigned SrcAlign = DAG.InferPtrAlignment(Src); if (Align > SrcAlign) SrcAlign = Align; StringRef Str; bool CopyFromStr = isMemSrcFromString(Src, Str); bool isZeroStr = CopyFromStr && Str.empty(); unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize); if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), (isZeroStr ? 0 : SrcAlign), false, false, CopyFromStr, true, DAG, TLI)) return SDValue(); if (DstAlignCanChange) { Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); unsigned NewAlign = (unsigned) TLI.getDataLayout()->getABITypeAlignment(Ty); // Don't promote to an alignment that would require dynamic stack // realignment. const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TRI->needsStackRealignment(MF)) while (NewAlign > Align && TLI.getDataLayout()->exceedsNaturalStackAlignment(NewAlign)) NewAlign /= 2; if (NewAlign > Align) { // Give the stack frame object a larger alignment if needed. if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign) MFI->setObjectAlignment(FI->getIndex(), NewAlign); Align = NewAlign; } } SmallVector OutChains; unsigned NumMemOps = MemOps.size(); uint64_t SrcOff = 0, DstOff = 0; for (unsigned i = 0; i != NumMemOps; ++i) { EVT VT = MemOps[i]; unsigned VTSize = VT.getSizeInBits() / 8; SDValue Value, Store; if (VTSize > Size) { // Issuing an unaligned load / store pair that overlaps with the previous // pair. Adjust the offset accordingly. assert(i == NumMemOps-1 && i != 0); SrcOff -= VTSize - Size; DstOff -= VTSize - Size; } if (CopyFromStr && (isZeroStr || (VT.isInteger() && !VT.isVector()))) { // It's unlikely a store of a vector immediate can be done in a single // instruction. It would require a load from a constantpool first. // We only handle zero vectors here. // FIXME: Handle other cases where store of vector immediate is done in // a single instruction. Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff)); if (Value.getNode()) Store = DAG.getStore(Chain, dl, Value, getMemBasePlusOffset(Dst, DstOff, dl, DAG), DstPtrInfo.getWithOffset(DstOff), isVol, false, Align); } if (!Store.getNode()) { // The type might not be legal for the target. This should only happen // if the type is smaller than a legal type, as on PPC, so the right // thing to do is generate a LoadExt/StoreTrunc pair. These simplify // to Load/Store if NVT==VT. // FIXME does the case above also need this? EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); assert(NVT.bitsGE(VT)); Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain, getMemBasePlusOffset(Src, SrcOff, dl, DAG), SrcPtrInfo.getWithOffset(SrcOff), VT, isVol, false, false, MinAlign(SrcAlign, SrcOff)); Store = DAG.getTruncStore(Chain, dl, Value, getMemBasePlusOffset(Dst, DstOff, dl, DAG), DstPtrInfo.getWithOffset(DstOff), VT, isVol, false, Align); } OutChains.push_back(Store); SrcOff += VTSize; DstOff += VTSize; Size -= VTSize; } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, unsigned Align, bool isVol, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { // Turn a memmove of undef to nop. if (Src.getOpcode() == ISD::UNDEF) return Chain; // Expand memmove to a series of load and store ops if the size operand falls // below a certain threshold. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); std::vector MemOps; bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); bool OptSize = MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; unsigned SrcAlign = DAG.InferPtrAlignment(Src); if (Align > SrcAlign) SrcAlign = Align; unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize); if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), SrcAlign, false, false, false, false, DAG, TLI)) return SDValue(); if (DstAlignCanChange) { Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); unsigned NewAlign = (unsigned) TLI.getDataLayout()->getABITypeAlignment(Ty); if (NewAlign > Align) { // Give the stack frame object a larger alignment if needed. if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign) MFI->setObjectAlignment(FI->getIndex(), NewAlign); Align = NewAlign; } } uint64_t SrcOff = 0, DstOff = 0; SmallVector LoadValues; SmallVector LoadChains; SmallVector OutChains; unsigned NumMemOps = MemOps.size(); for (unsigned i = 0; i < NumMemOps; i++) { EVT VT = MemOps[i]; unsigned VTSize = VT.getSizeInBits() / 8; SDValue Value; Value = DAG.getLoad(VT, dl, Chain, getMemBasePlusOffset(Src, SrcOff, dl, DAG), SrcPtrInfo.getWithOffset(SrcOff), isVol, false, false, SrcAlign); LoadValues.push_back(Value); LoadChains.push_back(Value.getValue(1)); SrcOff += VTSize; } Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); OutChains.clear(); for (unsigned i = 0; i < NumMemOps; i++) { EVT VT = MemOps[i]; unsigned VTSize = VT.getSizeInBits() / 8; SDValue Store; Store = DAG.getStore(Chain, dl, LoadValues[i], getMemBasePlusOffset(Dst, DstOff, dl, DAG), DstPtrInfo.getWithOffset(DstOff), isVol, false, Align); OutChains.push_back(Store); DstOff += VTSize; } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } /// \brief Lower the call to 'memset' intrinsic function into a series of store /// operations. /// /// \param DAG Selection DAG where lowered code is placed. /// \param dl Link to corresponding IR location. /// \param Chain Control flow dependency. /// \param Dst Pointer to destination memory location. /// \param Src Value of byte to write into the memory. /// \param Size Number of bytes to write. /// \param Align Alignment of the destination in bytes. /// \param isVol True if destination is volatile. /// \param DstPtrInfo IR information on the memory pointer. /// \returns New head in the control flow, if lowering was successful, empty /// SDValue otherwise. /// /// The function tries to replace 'llvm.memset' intrinsic with several store /// operations and value calculation code. This is usually profitable for small /// memory size. static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, unsigned Align, bool isVol, MachinePointerInfo DstPtrInfo) { // Turn a memset of undef to nop. if (Src.getOpcode() == ISD::UNDEF) return Chain; // Expand memset to a series of load/store ops if the size operand // falls below a certain threshold. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); std::vector MemOps; bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); bool OptSize = MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; bool IsZeroVal = isa(Src) && cast(Src)->isNullValue(); if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize), Size, (DstAlignCanChange ? 0 : Align), 0, true, IsZeroVal, false, true, DAG, TLI)) return SDValue(); if (DstAlignCanChange) { Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); unsigned NewAlign = (unsigned) TLI.getDataLayout()->getABITypeAlignment(Ty); if (NewAlign > Align) { // Give the stack frame object a larger alignment if needed. if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign) MFI->setObjectAlignment(FI->getIndex(), NewAlign); Align = NewAlign; } } SmallVector OutChains; uint64_t DstOff = 0; unsigned NumMemOps = MemOps.size(); // Find the largest store and generate the bit pattern for it. EVT LargestVT = MemOps[0]; for (unsigned i = 1; i < NumMemOps; i++) if (MemOps[i].bitsGT(LargestVT)) LargestVT = MemOps[i]; SDValue MemSetValue = getMemsetValue(Src, LargestVT, DAG, dl); for (unsigned i = 0; i < NumMemOps; i++) { EVT VT = MemOps[i]; unsigned VTSize = VT.getSizeInBits() / 8; if (VTSize > Size) { // Issuing an unaligned load / store pair that overlaps with the previous // pair. Adjust the offset accordingly. assert(i == NumMemOps-1 && i != 0); DstOff -= VTSize - Size; } // If this store is smaller than the largest store see whether we can get // the smaller value for free with a truncate. SDValue Value = MemSetValue; if (VT.bitsLT(LargestVT)) { if (!LargestVT.isVector() && !VT.isVector() && TLI.isTruncateFree(LargestVT, VT)) Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue); else Value = getMemsetValue(Src, VT, DAG, dl); } assert(Value.getValueType() == VT && "Value with wrong type."); SDValue Store = DAG.getStore(Chain, dl, Value, getMemBasePlusOffset(Dst, DstOff, dl, DAG), DstPtrInfo.getWithOffset(DstOff), isVol, false, Align); OutChains.push_back(Store); DstOff += VT.getSizeInBits() / 8; Size -= VTSize; } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { assert(Align && "The SDAG layer expects explicit alignment and reserves 0"); // Check to see if we should lower the memcpy to loads and stores first. // For cases within the target-specified limits, this is the best choice. ConstantSDNode *ConstantSize = dyn_cast(Size); if (ConstantSize) { // Memcpy with size zero? Just return the original chain. if (ConstantSize->isNullValue()) return Chain; SDValue Result = getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),Align, isVol, false, DstPtrInfo, SrcPtrInfo); if (Result.getNode()) return Result; } // Then check to see if we should lower the memcpy with target-specific // code. If the target chooses to do this, this is the next best. SDValue Result = TSI->EmitTargetCodeForMemcpy(*this, dl, Chain, Dst, Src, Size, Align, isVol, AlwaysInline, DstPtrInfo, SrcPtrInfo); if (Result.getNode()) return Result; // If we really need inline code and the target declined to provide it, // use a (potentially long) sequence of loads and stores. if (AlwaysInline) { assert(ConstantSize && "AlwaysInline requires a constant size!"); return getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Align, isVol, true, DstPtrInfo, SrcPtrInfo); } // FIXME: If the memcpy is volatile (isVol), lowering it to a plain libc // memcpy is not guaranteed to be safe. libc memcpys aren't required to // respect volatile, so they may do things like read or write memory // beyond the given memory regions. But fixing this isn't easy, and most // people don't care. // Emit a library call. TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Ty = TLI->getDataLayout()->getIntPtrType(*getContext()); Entry.Node = Dst; Args.push_back(Entry); Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY), Type::getVoidTy(*getContext()), getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY), TLI->getPointerTy()), std::move(Args), 0) .setDiscardResult(); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; } SDValue SelectionDAG::getMemmove(SDValue Chain, SDLoc dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { assert(Align && "The SDAG layer expects explicit alignment and reserves 0"); // Check to see if we should lower the memmove to loads and stores first. // For cases within the target-specified limits, this is the best choice. ConstantSDNode *ConstantSize = dyn_cast(Size); if (ConstantSize) { // Memmove with size zero? Just return the original chain. if (ConstantSize->isNullValue()) return Chain; SDValue Result = getMemmoveLoadsAndStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Align, isVol, false, DstPtrInfo, SrcPtrInfo); if (Result.getNode()) return Result; } // Then check to see if we should lower the memmove with target-specific // code. If the target chooses to do this, this is the next best. SDValue Result = TSI->EmitTargetCodeForMemmove( *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo, SrcPtrInfo); if (Result.getNode()) return Result; // FIXME: If the memmove is volatile, lowering it to plain libc memmove may // not be safe. See memcpy above for more details. // Emit a library call. TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Ty = TLI->getDataLayout()->getIntPtrType(*getContext()); Entry.Node = Dst; Args.push_back(Entry); Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), Type::getVoidTy(*getContext()), getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), TLI->getPointerTy()), std::move(Args), 0) .setDiscardResult(); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; } SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, MachinePointerInfo DstPtrInfo) { assert(Align && "The SDAG layer expects explicit alignment and reserves 0"); // Check to see if we should lower the memset to stores first. // For cases within the target-specified limits, this is the best choice. ConstantSDNode *ConstantSize = dyn_cast(Size); if (ConstantSize) { // Memset with size zero? Just return the original chain. if (ConstantSize->isNullValue()) return Chain; SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Align, isVol, DstPtrInfo); if (Result.getNode()) return Result; } // Then check to see if we should lower the memset with target-specific // code. If the target chooses to do this, this is the next best. SDValue Result = TSI->EmitTargetCodeForMemset(*this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo); if (Result.getNode()) return Result; // Emit a library call. Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(*getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; Entry.Ty = IntPtrTy; Args.push_back(Entry); Entry.Node = Src; Entry.Ty = Src.getValueType().getTypeForEVT(*getContext()); Args.push_back(Entry); Entry.Node = Size; Entry.Ty = IntPtrTy; Args.push_back(Entry); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), Type::getVoidTy(*getContext()), getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), TLI->getPointerTy()), std::move(Args), 0) .setDiscardResult(); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; } SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTList, ArrayRef Ops, MachineMemOperand *MMO, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) { FoldingSetNodeID ID; ID.AddInteger(MemVT.getRawBits()); AddNodeIDNode(ID, Opcode, VTList, Ops); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void* IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } // Allocate the operands array for the node out of the BumpPtrAllocator, since // SDNode doesn't have access to it. This memory will be "leaked" when // the node is deallocated, but recovered when the allocator is released. // If the number of operands is less than 5 we use AtomicSDNode's internal // storage. unsigned NumOps = Ops.size(); SDUse *DynOps = NumOps > 4 ? OperandAllocator.Allocate(NumOps) : nullptr; SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, MemVT, Ops.data(), DynOps, NumOps, MMO, SuccessOrdering, FailureOrdering, SynchScope); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTList, ArrayRef Ops, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope) { return getAtomic(Opcode, dl, MemVT, VTList, Ops, MMO, Ordering, Ordering, SynchScope); } SDValue SelectionDAG::getAtomicCmpSwap( unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs, SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo, unsigned Alignment, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) { assert(Opcode == ISD::ATOMIC_CMP_SWAP || Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types"); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(MemVT); MachineFunction &MF = getMachineFunction(); // FIXME: Volatile isn't really correct; we should keep track of atomic // orderings in the memoperand. unsigned Flags = MachineMemOperand::MOVolatile; Flags |= MachineMemOperand::MOLoad; Flags |= MachineMemOperand::MOStore; MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment); return getAtomicCmpSwap(Opcode, dl, MemVT, VTs, Chain, Ptr, Cmp, Swp, MMO, SuccessOrdering, FailureOrdering, SynchScope); } SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs, SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp, MachineMemOperand *MMO, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) { assert(Opcode == ISD::ATOMIC_CMP_SWAP || Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types"); SDValue Ops[] = {Chain, Ptr, Cmp, Swp}; return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, SuccessOrdering, FailureOrdering, SynchScope); } SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, const Value* PtrVal, unsigned Alignment, AtomicOrdering Ordering, SynchronizationScope SynchScope) { if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(MemVT); MachineFunction &MF = getMachineFunction(); // An atomic store does not load. An atomic load does not store. // (An atomicrmw obviously both loads and stores.) // For now, atomics are considered to be volatile always, and they are // chained as such. // FIXME: Volatile isn't really correct; we should keep track of atomic // orderings in the memoperand. unsigned Flags = MachineMemOperand::MOVolatile; if (Opcode != ISD::ATOMIC_STORE) Flags |= MachineMemOperand::MOLoad; if (Opcode != ISD::ATOMIC_LOAD) Flags |= MachineMemOperand::MOStore; MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags, MemVT.getStoreSize(), Alignment); return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Val, MMO, Ordering, SynchScope); } SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope) { assert((Opcode == ISD::ATOMIC_LOAD_ADD || Opcode == ISD::ATOMIC_LOAD_SUB || Opcode == ISD::ATOMIC_LOAD_AND || Opcode == ISD::ATOMIC_LOAD_OR || Opcode == ISD::ATOMIC_LOAD_XOR || Opcode == ISD::ATOMIC_LOAD_NAND || Opcode == ISD::ATOMIC_LOAD_MIN || Opcode == ISD::ATOMIC_LOAD_MAX || Opcode == ISD::ATOMIC_LOAD_UMIN || Opcode == ISD::ATOMIC_LOAD_UMAX || Opcode == ISD::ATOMIC_SWAP || Opcode == ISD::ATOMIC_STORE) && "Invalid Atomic Op"); EVT VT = Val.getValueType(); SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other) : getVTList(VT, MVT::Other); SDValue Ops[] = {Chain, Ptr, Val}; return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, Ordering, SynchScope); } SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope) { assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op"); SDVTList VTs = getVTList(VT, MVT::Other); SDValue Ops[] = {Chain, Ptr}; return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, Ordering, SynchScope); } /// getMergeValues - Create a MERGE_VALUES node from the given operands. SDValue SelectionDAG::getMergeValues(ArrayRef Ops, SDLoc dl) { if (Ops.size() == 1) return Ops[0]; SmallVector VTs; VTs.reserve(Ops.size()); for (unsigned i = 0; i < Ops.size(); ++i) VTs.push_back(Ops[i].getValueType()); return getNode(ISD::MERGE_VALUES, dl, getVTList(VTs), Ops); } SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, ArrayRef Ops, EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align, bool Vol, bool ReadMem, bool WriteMem, unsigned Size) { if (Align == 0) // Ensure that codegen never sees alignment 0 Align = getEVTAlignment(MemVT); MachineFunction &MF = getMachineFunction(); unsigned Flags = 0; if (WriteMem) Flags |= MachineMemOperand::MOStore; if (ReadMem) Flags |= MachineMemOperand::MOLoad; if (Vol) Flags |= MachineMemOperand::MOVolatile; if (!Size) Size = MemVT.getStoreSize(); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, Size, Align); return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO); } SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, ArrayRef Ops, EVT MemVT, MachineMemOperand *MMO) { assert((Opcode == ISD::INTRINSIC_VOID || Opcode == ISD::INTRINSIC_W_CHAIN || Opcode == ISD::PREFETCH || Opcode == ISD::LIFETIME_START || Opcode == ISD::LIFETIME_END || (Opcode <= INT_MAX && (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) && "Opcode is not a memory-accessing opcode!"); // Memoize the node unless it returns a flag. MemIntrinsicSDNode *N; if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTList, Ops); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, Ops, MemVT, MMO); CSEMap.InsertNode(N, IP); } else { N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, Ops, MemVT, MMO); } InsertNode(N); return SDValue(N, 0); } /// InferPointerInfo - If the specified ptr/offset is a frame index, infer a /// MachinePointerInfo record from it. This is particularly useful because the /// code generator has many cases where it doesn't bother passing in a /// MachinePointerInfo to getLoad or getStore when it has "FI+Cst". static MachinePointerInfo InferPointerInfo(SDValue Ptr, int64_t Offset = 0) { // If this is FI+Offset, we can model it. if (const FrameIndexSDNode *FI = dyn_cast(Ptr)) return MachinePointerInfo::getFixedStack(FI->getIndex(), Offset); // If this is (FI+Offset1)+Offset2, we can model it. if (Ptr.getOpcode() != ISD::ADD || !isa(Ptr.getOperand(1)) || !isa(Ptr.getOperand(0))) return MachinePointerInfo(); int FI = cast(Ptr.getOperand(0))->getIndex(); return MachinePointerInfo::getFixedStack(FI, Offset+ cast(Ptr.getOperand(1))->getSExtValue()); } /// InferPointerInfo - If the specified ptr/offset is a frame index, infer a /// MachinePointerInfo record from it. This is particularly useful because the /// code generator has many cases where it doesn't bother passing in a /// MachinePointerInfo to getLoad or getStore when it has "FI+Cst". static MachinePointerInfo InferPointerInfo(SDValue Ptr, SDValue OffsetOp) { // If the 'Offset' value isn't a constant, we can't handle this. if (ConstantSDNode *OffsetNode = dyn_cast(OffsetOp)) return InferPointerInfo(Ptr, OffsetNode->getSExtValue()); if (OffsetOp.getOpcode() == ISD::UNDEF) return InferPointerInfo(Ptr); return MachinePointerInfo(); } SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue Offset, MachinePointerInfo PtrInfo, EVT MemVT, bool isVolatile, bool isNonTemporal, bool isInvariant, unsigned Alignment, const AAMDNodes &AAInfo, const MDNode *Ranges) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(VT); unsigned Flags = MachineMemOperand::MOLoad; if (isVolatile) Flags |= MachineMemOperand::MOVolatile; if (isNonTemporal) Flags |= MachineMemOperand::MONonTemporal; if (isInvariant) Flags |= MachineMemOperand::MOInvariant; // If we don't have a PtrInfo, infer the trivial frame index case to simplify // clients. if (PtrInfo.V.isNull()) PtrInfo = InferPointerInfo(Ptr, Offset); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment, AAInfo, Ranges); return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO); } SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue Offset, EVT MemVT, MachineMemOperand *MMO) { if (VT == MemVT) { ExtType = ISD::NON_EXTLOAD; } else if (ExtType == ISD::NON_EXTLOAD) { assert(VT == MemVT && "Non-extending load from different memory type!"); } else { // Extending load. assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) && "Should only be an extending load, not truncating!"); assert(VT.isInteger() == MemVT.isInteger() && "Cannot convert from FP to Int or Int -> FP!"); assert(VT.isVector() == MemVT.isVector() && "Cannot use trunc store to convert to or from a vector!"); assert((!VT.isVector() || VT.getVectorNumElements() == MemVT.getVectorNumElements()) && "Cannot use trunc store to change the number of vector elements!"); } bool Indexed = AM != ISD::UNINDEXED; assert((Indexed || Offset.getOpcode() == ISD::UNDEF) && "Unindexed load with an offset!"); SDVTList VTs = Indexed ? getVTList(VT, Ptr.getValueType(), MVT::Other) : getVTList(VT, MVT::Other); SDValue Ops[] = { Chain, Ptr, Offset }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::LOAD, VTs, Ops); ID.AddInteger(MemVT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(ExtType, AM, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } SDNode *N = new (NodeAllocator) LoadSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, AM, ExtType, MemVT, MMO); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, bool isVolatile, bool isNonTemporal, bool isInvariant, unsigned Alignment, const AAMDNodes &AAInfo, const MDNode *Ranges) { SDValue Undef = getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef, PtrInfo, VT, isVolatile, isNonTemporal, isInvariant, Alignment, AAInfo, Ranges); } SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO) { SDValue Undef = getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef, VT, MMO); } SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, bool isVolatile, bool isNonTemporal, bool isInvariant, unsigned Alignment, const AAMDNodes &AAInfo) { SDValue Undef = getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, PtrInfo, MemVT, isVolatile, isNonTemporal, isInvariant, Alignment, AAInfo); } SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT, SDValue Chain, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO) { SDValue Undef = getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, MemVT, MMO); } SDValue SelectionDAG::getIndexedLoad(SDValue OrigLoad, SDLoc dl, SDValue Base, SDValue Offset, ISD::MemIndexedMode AM) { LoadSDNode *LD = cast(OrigLoad); assert(LD->getOffset().getOpcode() == ISD::UNDEF && "Load is already a indexed load!"); return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl, LD->getChain(), Base, Offset, LD->getPointerInfo(), LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(), false, LD->getAlignment()); } SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, bool isVolatile, bool isNonTemporal, unsigned Alignment, const AAMDNodes &AAInfo) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(Val.getValueType()); unsigned Flags = MachineMemOperand::MOStore; if (isVolatile) Flags |= MachineMemOperand::MOVolatile; if (isNonTemporal) Flags |= MachineMemOperand::MONonTemporal; if (PtrInfo.V.isNull()) PtrInfo = InferPointerInfo(Ptr); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, Val.getValueType().getStoreSize(), Alignment, AAInfo); return getStore(Chain, dl, Val, Ptr, MMO); } SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachineMemOperand *MMO) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); EVT VT = Val.getValueType(); SDVTList VTs = getVTList(MVT::Other); SDValue Undef = getUNDEF(Ptr.getValueType()); SDValue Ops[] = { Chain, Val, Ptr, Undef }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::STORE, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, ISD::UNINDEXED, false, VT, MMO); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT,bool isVolatile, bool isNonTemporal, unsigned Alignment, const AAMDNodes &AAInfo) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(SVT); unsigned Flags = MachineMemOperand::MOStore; if (isVolatile) Flags |= MachineMemOperand::MOVolatile; if (isNonTemporal) Flags |= MachineMemOperand::MONonTemporal; if (PtrInfo.V.isNull()) PtrInfo = InferPointerInfo(Ptr); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, SVT.getStoreSize(), Alignment, AAInfo); return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO); } SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, EVT SVT, MachineMemOperand *MMO) { EVT VT = Val.getValueType(); assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (VT == SVT) return getStore(Chain, dl, Val, Ptr, MMO); assert(SVT.getScalarType().bitsLT(VT.getScalarType()) && "Should only be a truncating store, not extending!"); assert(VT.isInteger() == SVT.isInteger() && "Can't do FP-INT conversion!"); assert(VT.isVector() == SVT.isVector() && "Cannot use trunc store to convert to or from a vector!"); assert((!VT.isVector() || VT.getVectorNumElements() == SVT.getVectorNumElements()) && "Cannot use trunc store to change the number of vector elements!"); SDVTList VTs = getVTList(MVT::Other); SDValue Undef = getUNDEF(Ptr.getValueType()); SDValue Ops[] = { Chain, Val, Ptr, Undef }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::STORE, VTs, Ops); ID.AddInteger(SVT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(true, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, ISD::UNINDEXED, true, SVT, MMO); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base, SDValue Offset, ISD::MemIndexedMode AM) { StoreSDNode *ST = cast(OrigStore); assert(ST->getOffset().getOpcode() == ISD::UNDEF && "Store is already a indexed store!"); SDVTList VTs = getVTList(Base.getValueType(), MVT::Other); SDValue Ops[] = { ST->getChain(), ST->getValue(), Base, Offset }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::STORE, VTs, Ops); ID.AddInteger(ST->getMemoryVT().getRawBits()); ID.AddInteger(ST->getRawSubclassData()); ID.AddInteger(ST->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, AM, ST->isTruncatingStore(), ST->getMemoryVT(), ST->getMemOperand()); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain, - SDValue Ptr, SDValue Mask, SDValue Src0, - MachineMemOperand *MMO) { + SDValue Ptr, SDValue Mask, SDValue Src0, EVT MemVT, + MachineMemOperand *MMO, ISD::LoadExtType ExtTy) { SDVTList VTs = getVTList(VT, MVT::Other); SDValue Ops[] = { Chain, Ptr, Mask, Src0 }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops); ID.AddInteger(VT.getRawBits()); - ID.AddInteger(encodeMemSDNodeFlags(ISD::NON_EXTLOAD, ISD::UNINDEXED, + ID.AddInteger(encodeMemSDNodeFlags(ExtTy, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } SDNode *N = new (NodeAllocator) MaskedLoadSDNode(dl.getIROrder(), dl.getDebugLoc(), Ops, 4, VTs, - VT, MMO); + ExtTy, MemVT, MMO); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val, - SDValue Ptr, SDValue Mask, MachineMemOperand *MMO) { + SDValue Ptr, SDValue Mask, EVT MemVT, + MachineMemOperand *MMO, bool isTrunc) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); EVT VT = Val.getValueType(); SDVTList VTs = getVTList(MVT::Other); SDValue Ops[] = { Chain, Ptr, Mask, Val }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } SDNode *N = new (NodeAllocator) MaskedStoreSDNode(dl.getIROrder(), dl.getDebugLoc(), Ops, 4, - VTs, VT, MMO); + VTs, isTrunc, MemVT, MMO); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getVAArg(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue SV, unsigned Align) { SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, MVT::i32) }; return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, ArrayRef Ops) { switch (Ops.size()) { case 0: return getNode(Opcode, DL, VT); case 1: return getNode(Opcode, DL, VT, static_cast(Ops[0])); case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; } // Copy from an SDUse array into an SDValue array for use with // the regular getNode logic. SmallVector NewOps(Ops.begin(), Ops.end()); return getNode(Opcode, DL, VT, NewOps); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, ArrayRef Ops) { unsigned NumOps = Ops.size(); switch (NumOps) { case 0: return getNode(Opcode, DL, VT); case 1: return getNode(Opcode, DL, VT, Ops[0]); case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; } switch (Opcode) { default: break; case ISD::SELECT_CC: { assert(NumOps == 5 && "SELECT_CC takes 5 operands!"); assert(Ops[0].getValueType() == Ops[1].getValueType() && "LHS and RHS of condition must have same type!"); assert(Ops[2].getValueType() == Ops[3].getValueType() && "True and False arms of SelectCC must have same type!"); assert(Ops[2].getValueType() == VT && "select_cc node must be of same type as true and false value!"); break; } case ISD::BR_CC: { assert(NumOps == 5 && "BR_CC takes 5 operands!"); assert(Ops[2].getValueType() == Ops[3].getValueType() && "LHS/RHS of comparison should match types!"); break; } } // Memoize nodes. SDNode *N; SDVTList VTs = getVTList(VT); if (VT != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Ops); CSEMap.InsertNode(N, IP); } else { N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Ops); } InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, ArrayRef ResultTys, ArrayRef Ops) { return getNode(Opcode, DL, getVTList(ResultTys), Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, ArrayRef Ops) { if (VTList.NumVTs == 1) return getNode(Opcode, DL, VTList.VTs[0], Ops); #if 0 switch (Opcode) { // FIXME: figure out how to safely handle things like // int foo(int x) { return 1 << (x & 255); } // int bar() { return foo(256); } case ISD::SRA_PARTS: case ISD::SRL_PARTS: case ISD::SHL_PARTS: if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG && cast(N3.getOperand(1))->getVT() != MVT::i1) return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0)); else if (N3.getOpcode() == ISD::AND) if (ConstantSDNode *AndRHS = dyn_cast(N3.getOperand(1))) { // If the and is only masking out bits that cannot effect the shift, // eliminate the and. unsigned NumBits = VT.getScalarType().getSizeInBits()*2; if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1) return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0)); } break; } #endif // Memoize the node unless it returns a flag. SDNode *N; unsigned NumOps = Ops.size(); if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTList, Ops); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); if (NumOps == 1) { N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0]); } else if (NumOps == 2) { N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1]); } else if (NumOps == 3) { N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1], Ops[2]); } else { N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops); } CSEMap.InsertNode(N, IP); } else { if (NumOps == 1) { N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0]); } else if (NumOps == 2) { N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1]); } else if (NumOps == 3) { N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1], Ops[2]); } else { N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops); } } InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList) { return getNode(Opcode, DL, VTList, None); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1) { SDValue Ops[] = { N1 }; return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1, SDValue N2) { SDValue Ops[] = { N1, N2 }; return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1, SDValue N2, SDValue N3) { SDValue Ops[] = { N1, N2, N3 }; return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1, SDValue N2, SDValue N3, SDValue N4) { SDValue Ops[] = { N1, N2, N3, N4 }; return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1, SDValue N2, SDValue N3, SDValue N4, SDValue N5) { SDValue Ops[] = { N1, N2, N3, N4, N5 }; return getNode(Opcode, DL, VTList, Ops); } SDVTList SelectionDAG::getVTList(EVT VT) { return makeVTList(SDNode::getValueTypeList(VT), 1); } SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) { FoldingSetNodeID ID; ID.AddInteger(2U); ID.AddInteger(VT1.getRawBits()); ID.AddInteger(VT2.getRawBits()); void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); if (!Result) { EVT *Array = Allocator.Allocate(2); Array[0] = VT1; Array[1] = VT2; Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 2); VTListMap.InsertNode(Result, IP); } return Result->getSDVTList(); } SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) { FoldingSetNodeID ID; ID.AddInteger(3U); ID.AddInteger(VT1.getRawBits()); ID.AddInteger(VT2.getRawBits()); ID.AddInteger(VT3.getRawBits()); void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); if (!Result) { EVT *Array = Allocator.Allocate(3); Array[0] = VT1; Array[1] = VT2; Array[2] = VT3; Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 3); VTListMap.InsertNode(Result, IP); } return Result->getSDVTList(); } SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) { FoldingSetNodeID ID; ID.AddInteger(4U); ID.AddInteger(VT1.getRawBits()); ID.AddInteger(VT2.getRawBits()); ID.AddInteger(VT3.getRawBits()); ID.AddInteger(VT4.getRawBits()); void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); if (!Result) { EVT *Array = Allocator.Allocate(4); Array[0] = VT1; Array[1] = VT2; Array[2] = VT3; Array[3] = VT4; Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 4); VTListMap.InsertNode(Result, IP); } return Result->getSDVTList(); } SDVTList SelectionDAG::getVTList(ArrayRef VTs) { unsigned NumVTs = VTs.size(); FoldingSetNodeID ID; ID.AddInteger(NumVTs); for (unsigned index = 0; index < NumVTs; index++) { ID.AddInteger(VTs[index].getRawBits()); } void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); if (!Result) { EVT *Array = Allocator.Allocate(NumVTs); std::copy(VTs.begin(), VTs.end(), Array); Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, NumVTs); VTListMap.InsertNode(Result, IP); } return Result->getSDVTList(); } /// UpdateNodeOperands - *Mutate* the specified node in-place to have the /// specified operands. If the resultant node already exists in the DAG, /// this does not modify the specified node, instead it returns the node that /// already exists. If the resultant node does not exist in the DAG, the /// input node is returned. As a degenerate case, if you specify the same /// input operands as the node already has, the input node is returned. SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) { assert(N->getNumOperands() == 1 && "Update with wrong number of operands"); // Check to see if there is no change. if (Op == N->getOperand(0)) return N; // See if the modified node already exists. void *InsertPos = nullptr; if (SDNode *Existing = FindModifiedNodeSlot(N, Op, InsertPos)) return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) if (!RemoveNodeFromCSEMaps(N)) InsertPos = nullptr; // Now we update the operands. N->OperandList[0].set(Op); // If this gets put into a CSE map, add it. if (InsertPos) CSEMap.InsertNode(N, InsertPos); return N; } SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) { assert(N->getNumOperands() == 2 && "Update with wrong number of operands"); // Check to see if there is no change. if (Op1 == N->getOperand(0) && Op2 == N->getOperand(1)) return N; // No operands changed, just return the input node. // See if the modified node already exists. void *InsertPos = nullptr; if (SDNode *Existing = FindModifiedNodeSlot(N, Op1, Op2, InsertPos)) return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) if (!RemoveNodeFromCSEMaps(N)) InsertPos = nullptr; // Now we update the operands. if (N->OperandList[0] != Op1) N->OperandList[0].set(Op1); if (N->OperandList[1] != Op2) N->OperandList[1].set(Op2); // If this gets put into a CSE map, add it. if (InsertPos) CSEMap.InsertNode(N, InsertPos); return N; } SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3) { SDValue Ops[] = { Op1, Op2, Op3 }; return UpdateNodeOperands(N, Ops); } SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4) { SDValue Ops[] = { Op1, Op2, Op3, Op4 }; return UpdateNodeOperands(N, Ops); } SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4, SDValue Op5) { SDValue Ops[] = { Op1, Op2, Op3, Op4, Op5 }; return UpdateNodeOperands(N, Ops); } SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, ArrayRef Ops) { unsigned NumOps = Ops.size(); assert(N->getNumOperands() == NumOps && "Update with wrong number of operands"); // Check to see if there is no change. bool AnyChange = false; for (unsigned i = 0; i != NumOps; ++i) { if (Ops[i] != N->getOperand(i)) { AnyChange = true; break; } } // No operands changed, just return the input node. if (!AnyChange) return N; // See if the modified node already exists. void *InsertPos = nullptr; if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, InsertPos)) return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) if (!RemoveNodeFromCSEMaps(N)) InsertPos = nullptr; // Now we update the operands. for (unsigned i = 0; i != NumOps; ++i) if (N->OperandList[i] != Ops[i]) N->OperandList[i].set(Ops[i]); // If this gets put into a CSE map, add it. if (InsertPos) CSEMap.InsertNode(N, InsertPos); return N; } /// DropOperands - Release the operands and set this node to have /// zero operands. void SDNode::DropOperands() { // Unlike the code in MorphNodeTo that does this, we don't need to // watch for dead nodes here. for (op_iterator I = op_begin(), E = op_end(); I != E; ) { SDUse &Use = *I++; Use.set(SDValue()); } } /// SelectNodeTo - These are wrappers around MorphNodeTo that accept a /// machine opcode. /// SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT) { SDVTList VTs = getVTList(VT); return SelectNodeTo(N, MachineOpc, VTs, None); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, SDValue Op1) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1, Op2 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1, Op2, Op3 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, ArrayRef Ops) { SDVTList VTs = getVTList(VT); return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2); return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2) { SDVTList VTs = getVTList(VT1, VT2); return SelectNodeTo(N, MachineOpc, VTs, None); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, EVT VT3, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2, VT3); return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, EVT VT3, EVT VT4, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2, VT3, VT4); return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, SDValue Op1) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1, Op2 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1, Op2, Op3 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, EVT VT3, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT1, VT2, VT3); SDValue Ops[] = { Op1, Op2, Op3 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, SDVTList VTs,ArrayRef Ops) { N = MorphNodeTo(N, ~MachineOpc, VTs, Ops); // Reset the NodeID to -1. N->setNodeId(-1); return N; } /// UpdadeSDLocOnMergedSDNode - If the opt level is -O0 then it throws away /// the line number information on the merged node since it is not possible to /// preserve the information that operation is associated with multiple lines. /// This will make the debugger working better at -O0, were there is a higher /// probability having other instructions associated with that line. /// /// For IROrder, we keep the smaller of the two SDNode *SelectionDAG::UpdadeSDLocOnMergedSDNode(SDNode *N, SDLoc OLoc) { DebugLoc NLoc = N->getDebugLoc(); if (!(NLoc.isUnknown()) && (OptLevel == CodeGenOpt::None) && (OLoc.getDebugLoc() != NLoc)) { N->setDebugLoc(DebugLoc()); } unsigned Order = std::min(N->getIROrder(), OLoc.getIROrder()); N->setIROrder(Order); return N; } /// MorphNodeTo - This *mutates* the specified node to have the specified /// return type, opcode, and operands. /// /// Note that MorphNodeTo returns the resultant node. If there is already a /// node of the specified opcode and operands, it returns that node instead of /// the current one. Note that the SDLoc need not be the same. /// /// Using MorphNodeTo is faster than creating a new node and swapping it in /// with ReplaceAllUsesWith both because it often avoids allocating a new /// node, and because it doesn't require CSE recalculation for any of /// the node's users. /// /// However, note that MorphNodeTo recursively deletes dead nodes from the DAG. /// As a consequence it isn't appropriate to use from within the DAG combiner or /// the legalizer which maintain worklists that would need to be updated when /// deleting things. SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef Ops) { unsigned NumOps = Ops.size(); // If an identical node already exists, use it. void *IP = nullptr; if (VTs.VTs[VTs.NumVTs-1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, VTs, Ops); if (SDNode *ON = CSEMap.FindNodeOrInsertPos(ID, IP)) return UpdadeSDLocOnMergedSDNode(ON, SDLoc(N)); } if (!RemoveNodeFromCSEMaps(N)) IP = nullptr; // Start the morphing. N->NodeType = Opc; N->ValueList = VTs.VTs; N->NumValues = VTs.NumVTs; // Clear the operands list, updating used nodes to remove this from their // use list. Keep track of any operands that become dead as a result. SmallPtrSet DeadNodeSet; for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) { SDUse &Use = *I++; SDNode *Used = Use.getNode(); Use.set(SDValue()); if (Used->use_empty()) DeadNodeSet.insert(Used); } if (MachineSDNode *MN = dyn_cast(N)) { // Initialize the memory references information. MN->setMemRefs(nullptr, nullptr); // If NumOps is larger than the # of operands we can have in a // MachineSDNode, reallocate the operand list. if (NumOps > MN->NumOperands || !MN->OperandsNeedDelete) { if (MN->OperandsNeedDelete) delete[] MN->OperandList; if (NumOps > array_lengthof(MN->LocalOperands)) // We're creating a final node that will live unmorphed for the // remainder of the current SelectionDAG iteration, so we can allocate // the operands directly out of a pool with no recycling metadata. MN->InitOperands(OperandAllocator.Allocate(NumOps), Ops.data(), NumOps); else MN->InitOperands(MN->LocalOperands, Ops.data(), NumOps); MN->OperandsNeedDelete = false; } else MN->InitOperands(MN->OperandList, Ops.data(), NumOps); } else { // If NumOps is larger than the # of operands we currently have, reallocate // the operand list. if (NumOps > N->NumOperands) { if (N->OperandsNeedDelete) delete[] N->OperandList; N->InitOperands(new SDUse[NumOps], Ops.data(), NumOps); N->OperandsNeedDelete = true; } else N->InitOperands(N->OperandList, Ops.data(), NumOps); } // Delete any nodes that are still dead after adding the uses for the // new operands. if (!DeadNodeSet.empty()) { SmallVector DeadNodes; for (SDNode *N : DeadNodeSet) if (N->use_empty()) DeadNodes.push_back(N); RemoveDeadNodes(DeadNodes); } if (IP) CSEMap.InsertNode(N, IP); // Memoize the new node. return N; } /// getMachineNode - These are used for target selectors to create a new node /// with specified return type(s), MachineInstr opcode, and operands. /// /// Note that getMachineNode returns the resultant node. If there is already a /// node of the specified opcode and operands, it returns that node instead of /// the current one. MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT) { SDVTList VTs = getVTList(VT); return getMachineNode(Opcode, dl, VTs, None); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT, SDValue Op1) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1, Op2 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1, Op2, Op3 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT, ArrayRef Ops) { SDVTList VTs = getVTList(VT); return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2) { SDVTList VTs = getVTList(VT1, VT2); return getMachineNode(Opcode, dl, VTs, None); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, SDValue Op1) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1, Op2 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1, Op2, Op3 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2); return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, EVT VT3, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT1, VT2, VT3); SDValue Ops[] = { Op1, Op2 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, EVT VT3, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT1, VT2, VT3); SDValue Ops[] = { Op1, Op2, Op3 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, EVT VT3, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2, VT3); return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, EVT VT3, EVT VT4, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2, VT3, VT4); return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, ArrayRef ResultTys, ArrayRef Ops) { SDVTList VTs = getVTList(ResultTys); return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc DL, SDVTList VTs, ArrayRef OpsArray) { bool DoCSE = VTs.VTs[VTs.NumVTs-1] != MVT::Glue; MachineSDNode *N; void *IP = nullptr; const SDValue *Ops = OpsArray.data(); unsigned NumOps = OpsArray.size(); if (DoCSE) { FoldingSetNodeID ID; AddNodeIDNode(ID, ~Opcode, VTs, OpsArray); IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { return cast(UpdadeSDLocOnMergedSDNode(E, DL)); } } // Allocate a new MachineSDNode. N = new (NodeAllocator) MachineSDNode(~Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); // Initialize the operands list. if (NumOps > array_lengthof(N->LocalOperands)) // We're creating a final node that will live unmorphed for the // remainder of the current SelectionDAG iteration, so we can allocate // the operands directly out of a pool with no recycling metadata. N->InitOperands(OperandAllocator.Allocate(NumOps), Ops, NumOps); else N->InitOperands(N->LocalOperands, Ops, NumOps); N->OperandsNeedDelete = false; if (DoCSE) CSEMap.InsertNode(N, IP); InsertNode(N); return N; } /// getTargetExtractSubreg - A convenience function for creating /// TargetOpcode::EXTRACT_SUBREG nodes. SDValue SelectionDAG::getTargetExtractSubreg(int SRIdx, SDLoc DL, EVT VT, SDValue Operand) { SDValue SRIdxVal = getTargetConstant(SRIdx, MVT::i32); SDNode *Subreg = getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, Operand, SRIdxVal); return SDValue(Subreg, 0); } /// getTargetInsertSubreg - A convenience function for creating /// TargetOpcode::INSERT_SUBREG nodes. SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, SDLoc DL, EVT VT, SDValue Operand, SDValue Subreg) { SDValue SRIdxVal = getTargetConstant(SRIdx, MVT::i32); SDNode *Result = getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT, Operand, Subreg, SRIdxVal); return SDValue(Result, 0); } /// getNodeIfExists - Get the specified node if it's already available, or /// else return NULL. SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef Ops, bool nuw, bool nsw, bool exact) { if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTList, Ops); if (isBinOpWithFlags(Opcode)) AddBinaryNodeIDCustom(ID, nuw, nsw, exact); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return E; } return nullptr; } /// getDbgValue - Creates a SDDbgValue node. /// /// SDNode SDDbgValue *SelectionDAG::getDbgValue(MDNode *Var, MDNode *Expr, SDNode *N, unsigned R, bool IsIndirect, uint64_t Off, DebugLoc DL, unsigned O) { return new (Allocator) SDDbgValue(Var, Expr, N, R, IsIndirect, Off, DL, O); } /// Constant SDDbgValue *SelectionDAG::getConstantDbgValue(MDNode *Var, MDNode *Expr, const Value *C, uint64_t Off, DebugLoc DL, unsigned O) { return new (Allocator) SDDbgValue(Var, Expr, C, Off, DL, O); } /// FrameIndex SDDbgValue *SelectionDAG::getFrameIndexDbgValue(MDNode *Var, MDNode *Expr, unsigned FI, uint64_t Off, DebugLoc DL, unsigned O) { return new (Allocator) SDDbgValue(Var, Expr, FI, Off, DL, O); } namespace { /// RAUWUpdateListener - Helper for ReplaceAllUsesWith - When the node /// pointed to by a use iterator is deleted, increment the use iterator /// so that it doesn't dangle. /// class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener { SDNode::use_iterator &UI; SDNode::use_iterator &UE; void NodeDeleted(SDNode *N, SDNode *E) override { // Increment the iterator as needed. while (UI != UE && N == *UI) ++UI; } public: RAUWUpdateListener(SelectionDAG &d, SDNode::use_iterator &ui, SDNode::use_iterator &ue) : SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {} }; } /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead. /// This can cause recursive merging of nodes in the DAG. /// /// This version assumes From has a single result value. /// void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) { SDNode *From = FromN.getNode(); assert(From->getNumValues() == 1 && FromN.getResNo() == 0 && "Cannot replace with this method!"); assert(From != To.getNode() && "Cannot replace uses of with self"); // Iterate over all the existing uses of From. New uses will be added // to the beginning of the use list, which we avoid visiting. // This specifically avoids visiting uses of From that arise while the // replacement is happening, because any such uses would be the result // of CSE: If an existing node looks like From after one of its operands // is replaced by To, we don't want to replace of all its users with To // too. See PR3018 for more info. SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; // This node is about to morph, remove its old self from the CSE maps. RemoveNodeFromCSEMaps(User); // A user can appear in a use list multiple times, and when this // happens the uses are usually next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. do { SDUse &Use = UI.getUse(); ++UI; Use.set(To); } while (UI != UE && *UI == User); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. if (FromN == getRoot()) setRoot(To); } /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead. /// This can cause recursive merging of nodes in the DAG. /// /// This version assumes that for each value of From, there is a /// corresponding value in To in the same position with the same type. /// void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To) { #ifndef NDEBUG for (unsigned i = 0, e = From->getNumValues(); i != e; ++i) assert((!From->hasAnyUseOfValue(i) || From->getValueType(i) == To->getValueType(i)) && "Cannot use this version of ReplaceAllUsesWith!"); #endif // Handle the trivial case. if (From == To) return; // Iterate over just the existing users of From. See the comments in // the ReplaceAllUsesWith above. SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; // This node is about to morph, remove its old self from the CSE maps. RemoveNodeFromCSEMaps(User); // A user can appear in a use list multiple times, and when this // happens the uses are usually next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. do { SDUse &Use = UI.getUse(); ++UI; Use.setNode(To); } while (UI != UE && *UI == User); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. if (From == getRoot().getNode()) setRoot(SDValue(To, getRoot().getResNo())); } /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead. /// This can cause recursive merging of nodes in the DAG. /// /// This version can replace From with any result values. To must match the /// number and types of values returned by From. void SelectionDAG::ReplaceAllUsesWith(SDNode *From, const SDValue *To) { if (From->getNumValues() == 1) // Handle the simple case efficiently. return ReplaceAllUsesWith(SDValue(From, 0), To[0]); // Iterate over just the existing users of From. See the comments in // the ReplaceAllUsesWith above. SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; // This node is about to morph, remove its old self from the CSE maps. RemoveNodeFromCSEMaps(User); // A user can appear in a use list multiple times, and when this // happens the uses are usually next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. do { SDUse &Use = UI.getUse(); const SDValue &ToOp = To[Use.getResNo()]; ++UI; Use.set(ToOp); } while (UI != UE && *UI == User); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. if (From == getRoot().getNode()) setRoot(SDValue(To[getRoot().getResNo()])); } /// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving /// uses of other values produced by From.getNode() alone. The Deleted /// vector is handled the same way as for ReplaceAllUsesWith. void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){ // Handle the really simple, really trivial case efficiently. if (From == To) return; // Handle the simple, trivial, case efficiently. if (From.getNode()->getNumValues() == 1) { ReplaceAllUsesWith(From, To); return; } // Iterate over just the existing users of From. See the comments in // the ReplaceAllUsesWith above. SDNode::use_iterator UI = From.getNode()->use_begin(), UE = From.getNode()->use_end(); RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; bool UserRemovedFromCSEMaps = false; // A user can appear in a use list multiple times, and when this // happens the uses are usually next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. do { SDUse &Use = UI.getUse(); // Skip uses of different values from the same node. if (Use.getResNo() != From.getResNo()) { ++UI; continue; } // If this node hasn't been modified yet, it's still in the CSE maps, // so remove its old self from the CSE maps. if (!UserRemovedFromCSEMaps) { RemoveNodeFromCSEMaps(User); UserRemovedFromCSEMaps = true; } ++UI; Use.set(To); } while (UI != UE && *UI == User); // We are iterating over all uses of the From node, so if a use // doesn't use the specific value, no changes are made. if (!UserRemovedFromCSEMaps) continue; // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. if (From == getRoot()) setRoot(To); } namespace { /// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith /// to record information about a use. struct UseMemo { SDNode *User; unsigned Index; SDUse *Use; }; /// operator< - Sort Memos by User. bool operator<(const UseMemo &L, const UseMemo &R) { return (intptr_t)L.User < (intptr_t)R.User; } } /// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving /// uses of other values produced by From.getNode() alone. The same value /// may appear in both the From and To list. The Deleted vector is /// handled the same way as for ReplaceAllUsesWith. void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From, const SDValue *To, unsigned Num){ // Handle the simple, trivial case efficiently. if (Num == 1) return ReplaceAllUsesOfValueWith(*From, *To); // Read up all the uses and make records of them. This helps // processing new uses that are introduced during the // replacement process. SmallVector Uses; for (unsigned i = 0; i != Num; ++i) { unsigned FromResNo = From[i].getResNo(); SDNode *FromNode = From[i].getNode(); for (SDNode::use_iterator UI = FromNode->use_begin(), E = FromNode->use_end(); UI != E; ++UI) { SDUse &Use = UI.getUse(); if (Use.getResNo() == FromResNo) { UseMemo Memo = { *UI, i, &Use }; Uses.push_back(Memo); } } } // Sort the uses, so that all the uses from a given User are together. std::sort(Uses.begin(), Uses.end()); for (unsigned UseIndex = 0, UseIndexEnd = Uses.size(); UseIndex != UseIndexEnd; ) { // We know that this user uses some value of From. If it is the right // value, update it. SDNode *User = Uses[UseIndex].User; // This node is about to morph, remove its old self from the CSE maps. RemoveNodeFromCSEMaps(User); // The Uses array is sorted, so all the uses for a given User // are next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. do { unsigned i = Uses[UseIndex].Index; SDUse &Use = *Uses[UseIndex].Use; ++UseIndex; Use.set(To[i]); } while (UseIndex != UseIndexEnd && Uses[UseIndex].User == User); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } } /// AssignTopologicalOrder - Assign a unique node id for each node in the DAG /// based on their topological order. It returns the maximum id and a vector /// of the SDNodes* in assigned order by reference. unsigned SelectionDAG::AssignTopologicalOrder() { unsigned DAGSize = 0; // SortedPos tracks the progress of the algorithm. Nodes before it are // sorted, nodes after it are unsorted. When the algorithm completes // it is at the end of the list. allnodes_iterator SortedPos = allnodes_begin(); // Visit all the nodes. Move nodes with no operands to the front of // the list immediately. Annotate nodes that do have operands with their // operand count. Before we do this, the Node Id fields of the nodes // may contain arbitrary values. After, the Node Id fields for nodes // before SortedPos will contain the topological sort index, and the // Node Id fields for nodes At SortedPos and after will contain the // count of outstanding operands. for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) { SDNode *N = I++; checkForCycles(N, this); unsigned Degree = N->getNumOperands(); if (Degree == 0) { // A node with no uses, add it to the result array immediately. N->setNodeId(DAGSize++); allnodes_iterator Q = N; if (Q != SortedPos) SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q)); assert(SortedPos != AllNodes.end() && "Overran node list"); ++SortedPos; } else { // Temporarily use the Node Id as scratch space for the degree count. N->setNodeId(Degree); } } // Visit all the nodes. As we iterate, move nodes into sorted order, // such that by the time the end is reached all nodes will be sorted. for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ++I) { SDNode *N = I; checkForCycles(N, this); // N is in sorted position, so all its uses have one less operand // that needs to be sorted. for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { SDNode *P = *UI; unsigned Degree = P->getNodeId(); assert(Degree != 0 && "Invalid node degree"); --Degree; if (Degree == 0) { // All of P's operands are sorted, so P may sorted now. P->setNodeId(DAGSize++); if (P != SortedPos) SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(P)); assert(SortedPos != AllNodes.end() && "Overran node list"); ++SortedPos; } else { // Update P's outstanding operand count. P->setNodeId(Degree); } } if (I == SortedPos) { #ifndef NDEBUG SDNode *S = ++I; dbgs() << "Overran sorted position:\n"; S->dumprFull(this); dbgs() << "\n"; dbgs() << "Checking if this is due to cycles\n"; checkForCycles(this, true); #endif llvm_unreachable(nullptr); } } assert(SortedPos == AllNodes.end() && "Topological sort incomplete!"); assert(AllNodes.front().getOpcode() == ISD::EntryToken && "First node in topological sort is not the entry token!"); assert(AllNodes.front().getNodeId() == 0 && "First node in topological sort has non-zero id!"); assert(AllNodes.front().getNumOperands() == 0 && "First node in topological sort has operands!"); assert(AllNodes.back().getNodeId() == (int)DAGSize-1 && "Last node in topologic sort has unexpected id!"); assert(AllNodes.back().use_empty() && "Last node in topologic sort has users!"); assert(DAGSize == allnodes_size() && "Node count mismatch!"); return DAGSize; } /// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the /// value is produced by SD. void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter) { if (SD) { assert(DbgInfo->getSDDbgValues(SD).empty() || SD->getHasDebugValue()); SD->setHasDebugValue(true); } DbgInfo->add(DB, SD, isParameter); } /// TransferDbgValues - Transfer SDDbgValues. void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) { if (From == To || !From.getNode()->getHasDebugValue()) return; SDNode *FromNode = From.getNode(); SDNode *ToNode = To.getNode(); ArrayRef DVs = GetDbgValues(FromNode); SmallVector ClonedDVs; for (ArrayRef::iterator I = DVs.begin(), E = DVs.end(); I != E; ++I) { SDDbgValue *Dbg = *I; if (Dbg->getKind() == SDDbgValue::SDNODE) { SDDbgValue *Clone = getDbgValue(Dbg->getVariable(), Dbg->getExpression(), ToNode, To.getResNo(), Dbg->isIndirect(), Dbg->getOffset(), Dbg->getDebugLoc(), Dbg->getOrder()); ClonedDVs.push_back(Clone); } } for (SmallVectorImpl::iterator I = ClonedDVs.begin(), E = ClonedDVs.end(); I != E; ++I) AddDbgValue(*I, ToNode, false); } //===----------------------------------------------------------------------===// // SDNode Class //===----------------------------------------------------------------------===// HandleSDNode::~HandleSDNode() { DropOperands(); } GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order, DebugLoc DL, const GlobalValue *GA, EVT VT, int64_t o, unsigned char TF) : SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) { TheGlobal = GA; } AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, DebugLoc dl, EVT VT, SDValue X, unsigned SrcAS, unsigned DestAS) : UnarySDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT), X), SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {} MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, EVT memvt, MachineMemOperand *mmo) : SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) { SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant()); assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!"); assert(isNonTemporal() == MMO->isNonTemporal() && "Non-temporal encoding error!"); // We check here that the size of the memory operand fits within the size of // the MMO. This is because the MMO might indicate only a possible address // range instead of specifying the affected memory addresses precisely. assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!"); } MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, ArrayRef Ops, EVT memvt, MachineMemOperand *mmo) : SDNode(Opc, Order, dl, VTs, Ops), MemoryVT(memvt), MMO(mmo) { SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant()); assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!"); assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!"); } /// Profile - Gather unique data for the node. /// void SDNode::Profile(FoldingSetNodeID &ID) const { AddNodeIDNode(ID, this); } namespace { struct EVTArray { std::vector VTs; EVTArray() { VTs.reserve(MVT::LAST_VALUETYPE); for (unsigned i = 0; i < MVT::LAST_VALUETYPE; ++i) VTs.push_back(MVT((MVT::SimpleValueType)i)); } }; } static ManagedStatic > EVTs; static ManagedStatic SimpleVTArray; static ManagedStatic > VTMutex; /// getValueTypeList - Return a pointer to the specified value type. /// const EVT *SDNode::getValueTypeList(EVT VT) { if (VT.isExtended()) { sys::SmartScopedLock Lock(*VTMutex); return &(*EVTs->insert(VT).first); } else { assert(VT.getSimpleVT() < MVT::LAST_VALUETYPE && "Value type out of range!"); return &SimpleVTArray->VTs[VT.getSimpleVT().SimpleTy]; } } /// hasNUsesOfValue - Return true if there are exactly NUSES uses of the /// indicated value. This method ignores uses of other values defined by this /// operation. bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const { assert(Value < getNumValues() && "Bad value!"); // TODO: Only iterate over uses of a given value of the node for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) { if (UI.getUse().getResNo() == Value) { if (NUses == 0) return false; --NUses; } } // Found exactly the right number of uses? return NUses == 0; } /// hasAnyUseOfValue - Return true if there are any use of the indicated /// value. This method ignores uses of other values defined by this operation. bool SDNode::hasAnyUseOfValue(unsigned Value) const { assert(Value < getNumValues() && "Bad value!"); for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) if (UI.getUse().getResNo() == Value) return true; return false; } /// isOnlyUserOf - Return true if this node is the only use of N. /// bool SDNode::isOnlyUserOf(SDNode *N) const { bool Seen = false; for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { SDNode *User = *I; if (User == this) Seen = true; else return false; } return Seen; } /// isOperand - Return true if this node is an operand of N. /// bool SDValue::isOperandOf(SDNode *N) const { for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) if (*this == N->getOperand(i)) return true; return false; } bool SDNode::isOperandOf(SDNode *N) const { for (unsigned i = 0, e = N->NumOperands; i != e; ++i) if (this == N->OperandList[i].getNode()) return true; return false; } /// reachesChainWithoutSideEffects - Return true if this operand (which must /// be a chain) reaches the specified operand without crossing any /// side-effecting instructions on any chain path. In practice, this looks /// through token factors and non-volatile loads. In order to remain efficient, /// this only looks a couple of nodes in, it does not do an exhaustive search. bool SDValue::reachesChainWithoutSideEffects(SDValue Dest, unsigned Depth) const { if (*this == Dest) return true; // Don't search too deeply, we just want to be able to see through // TokenFactor's etc. if (Depth == 0) return false; // If this is a token factor, all inputs to the TF happen in parallel. If any // of the operands of the TF does not reach dest, then we cannot do the xform. if (getOpcode() == ISD::TokenFactor) { for (unsigned i = 0, e = getNumOperands(); i != e; ++i) if (!getOperand(i).reachesChainWithoutSideEffects(Dest, Depth-1)) return false; return true; } // Loads don't have side effects, look through them. if (LoadSDNode *Ld = dyn_cast(*this)) { if (!Ld->isVolatile()) return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1); } return false; } /// hasPredecessor - Return true if N is a predecessor of this node. /// N is either an operand of this node, or can be reached by recursively /// traversing up the operands. /// NOTE: This is an expensive method. Use it carefully. bool SDNode::hasPredecessor(const SDNode *N) const { SmallPtrSet Visited; SmallVector Worklist; return hasPredecessorHelper(N, Visited, Worklist); } bool SDNode::hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl &Visited, SmallVectorImpl &Worklist) const { if (Visited.empty()) { Worklist.push_back(this); } else { // Take a look in the visited set. If we've already encountered this node // we needn't search further. if (Visited.count(N)) return true; } // Haven't visited N yet. Continue the search. while (!Worklist.empty()) { const SDNode *M = Worklist.pop_back_val(); for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) { SDNode *Op = M->getOperand(i).getNode(); if (Visited.insert(Op).second) Worklist.push_back(Op); if (Op == N) return true; } } return false; } uint64_t SDNode::getConstantOperandVal(unsigned Num) const { assert(Num < NumOperands && "Invalid child # of SDNode!"); return cast(OperandList[Num])->getZExtValue(); } SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { assert(N->getNumValues() == 1 && "Can't unroll a vector with multiple results!"); EVT VT = N->getValueType(0); unsigned NE = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); SDLoc dl(N); SmallVector Scalars; SmallVector Operands(N->getNumOperands()); // If ResNE is 0, fully unroll the vector op. if (ResNE == 0) ResNE = NE; else if (NE > ResNE) NE = ResNE; unsigned i; for (i= 0; i != NE; ++i) { for (unsigned j = 0, e = N->getNumOperands(); j != e; ++j) { SDValue Operand = N->getOperand(j); EVT OperandVT = Operand.getValueType(); if (OperandVT.isVector()) { // A vector operand; extract a single element. EVT OperandEltVT = OperandVT.getVectorElementType(); Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT, Operand, getConstant(i, TLI->getVectorIdxTy())); } else { // A scalar operand; just use it as is. Operands[j] = Operand; } } switch (N->getOpcode()) { default: Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands)); break; case ISD::VSELECT: Scalars.push_back(getNode(ISD::SELECT, dl, EltVT, Operands)); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::ROTL: case ISD::ROTR: Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0], getShiftAmountOperand(Operands[0].getValueType(), Operands[1]))); break; case ISD::SIGN_EXTEND_INREG: case ISD::FP_ROUND_INREG: { EVT ExtVT = cast(Operands[1])->getVT().getVectorElementType(); Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0], getValueType(ExtVT))); } } } for (; i < ResNE; ++i) Scalars.push_back(getUNDEF(EltVT)); return getNode(ISD::BUILD_VECTOR, dl, EVT::getVectorVT(*getContext(), EltVT, ResNE), Scalars); } /// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a /// location that is 'Dist' units away from the location that the 'Base' load /// is loading from. bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const { if (LD->getChain() != Base->getChain()) return false; EVT VT = LD->getValueType(0); if (VT.getSizeInBits() / 8 != Bytes) return false; SDValue Loc = LD->getOperand(1); SDValue BaseLoc = Base->getOperand(1); if (Loc.getOpcode() == ISD::FrameIndex) { if (BaseLoc.getOpcode() != ISD::FrameIndex) return false; const MachineFrameInfo *MFI = getMachineFunction().getFrameInfo(); int FI = cast(Loc)->getIndex(); int BFI = cast(BaseLoc)->getIndex(); int FS = MFI->getObjectSize(FI); int BFS = MFI->getObjectSize(BFI); if (FS != BFS || FS != (int)Bytes) return false; return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes); } // Handle X + C. if (isBaseWithConstantOffset(Loc)) { int64_t LocOffset = cast(Loc.getOperand(1))->getSExtValue(); if (Loc.getOperand(0) == BaseLoc) { // If the base location is a simple address with no offset itself, then // the second load's first add operand should be the base address. if (LocOffset == Dist * (int)Bytes) return true; } else if (isBaseWithConstantOffset(BaseLoc)) { // The base location itself has an offset, so subtract that value from the // second load's offset before comparing to distance * size. int64_t BOffset = cast(BaseLoc.getOperand(1))->getSExtValue(); if (Loc.getOperand(0) == BaseLoc.getOperand(0)) { if ((LocOffset - BOffset) == Dist * (int)Bytes) return true; } } } const GlobalValue *GV1 = nullptr; const GlobalValue *GV2 = nullptr; int64_t Offset1 = 0; int64_t Offset2 = 0; bool isGA1 = TLI->isGAPlusOffset(Loc.getNode(), GV1, Offset1); bool isGA2 = TLI->isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); if (isGA1 && isGA2 && GV1 == GV2) return Offset1 == (Offset2 + Dist*Bytes); return false; } /// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if /// it cannot be inferred. unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { // If this is a GlobalAddress + cst, return the alignment. const GlobalValue *GV; int64_t GVOffset = 0; if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) { unsigned PtrWidth = TLI->getPointerTypeSizeInBits(GV->getType()); APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0); llvm::computeKnownBits(const_cast(GV), KnownZero, KnownOne, TLI->getDataLayout()); unsigned AlignBits = KnownZero.countTrailingOnes(); unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0; if (Align) return MinAlign(Align, GVOffset); } // If this is a direct reference to a stack slot, use information about the // stack slot's alignment. int FrameIdx = 1 << 31; int64_t FrameOffset = 0; if (FrameIndexSDNode *FI = dyn_cast(Ptr)) { FrameIdx = FI->getIndex(); } else if (isBaseWithConstantOffset(Ptr) && isa(Ptr.getOperand(0))) { // Handle FI+Cst FrameIdx = cast(Ptr.getOperand(0))->getIndex(); FrameOffset = Ptr.getConstantOperandVal(1); } if (FrameIdx != (1 << 31)) { const MachineFrameInfo &MFI = *getMachineFunction().getFrameInfo(); unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx), FrameOffset); return FIInfoAlign; } return 0; } /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type /// which is split (or expanded) into two not necessarily identical pieces. std::pair SelectionDAG::GetSplitDestVTs(const EVT &VT) const { // Currently all types are split in half. EVT LoVT, HiVT; if (!VT.isVector()) { LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT); } else { unsigned NumElements = VT.getVectorNumElements(); assert(!(NumElements & 1) && "Splitting vector, but not in half!"); LoVT = HiVT = EVT::getVectorVT(*getContext(), VT.getVectorElementType(), NumElements/2); } return std::make_pair(LoVT, HiVT); } /// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the /// low/high part. std::pair SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT) { assert(LoVT.getVectorNumElements() + HiVT.getVectorNumElements() <= N.getValueType().getVectorNumElements() && "More vector elements requested than available!"); SDValue Lo, Hi; Lo = getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, getConstant(0, TLI->getVectorIdxTy())); Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N, getConstant(LoVT.getVectorNumElements(), TLI->getVectorIdxTy())); return std::make_pair(Lo, Hi); } void SelectionDAG::ExtractVectorElements(SDValue Op, SmallVectorImpl &Args, unsigned Start, unsigned Count) { EVT VT = Op.getValueType(); if (Count == 0) Count = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); EVT IdxTy = TLI->getVectorIdxTy(); SDLoc SL(Op); for (unsigned i = Start, e = Start + Count; i != e; ++i) { Args.push_back(getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Op, getConstant(i, IdxTy))); } } // getAddressSpace - Return the address space this GlobalAddress belongs to. unsigned GlobalAddressSDNode::getAddressSpace() const { return getGlobal()->getType()->getAddressSpace(); } Type *ConstantPoolSDNode::getType() const { if (isMachineConstantPoolEntry()) return Val.MachineCPVal->getType(); return Val.ConstVal->getType(); } bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits, bool isBigEndian) const { EVT VT = getValueType(0); assert(VT.isVector() && "Expected a vector type"); unsigned sz = VT.getSizeInBits(); if (MinSplatBits > sz) return false; SplatValue = APInt(sz, 0); SplatUndef = APInt(sz, 0); // Get the bits. Bits with undefined values (when the corresponding element // of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared // in SplatValue. If any of the values are not constant, give up and return // false. unsigned int nOps = getNumOperands(); assert(nOps > 0 && "isConstantSplat has 0-size build vector"); unsigned EltBitSize = VT.getVectorElementType().getSizeInBits(); for (unsigned j = 0; j < nOps; ++j) { unsigned i = isBigEndian ? nOps-1-j : j; SDValue OpVal = getOperand(i); unsigned BitPos = j * EltBitSize; if (OpVal.getOpcode() == ISD::UNDEF) SplatUndef |= APInt::getBitsSet(sz, BitPos, BitPos + EltBitSize); else if (ConstantSDNode *CN = dyn_cast(OpVal)) SplatValue |= CN->getAPIntValue().zextOrTrunc(EltBitSize). zextOrTrunc(sz) << BitPos; else if (ConstantFPSDNode *CN = dyn_cast(OpVal)) SplatValue |= CN->getValueAPF().bitcastToAPInt().zextOrTrunc(sz) < 8) { unsigned HalfSize = sz / 2; APInt HighValue = SplatValue.lshr(HalfSize).trunc(HalfSize); APInt LowValue = SplatValue.trunc(HalfSize); APInt HighUndef = SplatUndef.lshr(HalfSize).trunc(HalfSize); APInt LowUndef = SplatUndef.trunc(HalfSize); // If the two halves do not match (ignoring undef bits), stop here. if ((HighValue & ~LowUndef) != (LowValue & ~HighUndef) || MinSplatBits > HalfSize) break; SplatValue = HighValue | LowValue; SplatUndef = HighUndef & LowUndef; sz = HalfSize; } SplatBitSize = sz; return true; } SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const { if (UndefElements) { UndefElements->clear(); UndefElements->resize(getNumOperands()); } SDValue Splatted; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { SDValue Op = getOperand(i); if (Op.getOpcode() == ISD::UNDEF) { if (UndefElements) (*UndefElements)[i] = true; } else if (!Splatted) { Splatted = Op; } else if (Splatted != Op) { return SDValue(); } } if (!Splatted) { assert(getOperand(0).getOpcode() == ISD::UNDEF && "Can only have a splat without a constant for all undefs."); return getOperand(0); } return Splatted; } ConstantSDNode * BuildVectorSDNode::getConstantSplatNode(BitVector *UndefElements) const { return dyn_cast_or_null( getSplatValue(UndefElements).getNode()); } ConstantFPSDNode * BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const { return dyn_cast_or_null( getSplatValue(UndefElements).getNode()); } bool BuildVectorSDNode::isConstant() const { for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { unsigned Opc = getOperand(i).getOpcode(); if (Opc != ISD::UNDEF && Opc != ISD::Constant && Opc != ISD::ConstantFP) return false; } return true; } bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) { // Find the first non-undef value in the shuffle mask. unsigned i, e; for (i = 0, e = VT.getVectorNumElements(); i != e && Mask[i] < 0; ++i) /* search */; assert(i != e && "VECTOR_SHUFFLE node with all undef indices!"); // Make sure all remaining elements are either undef or the same as the first // non-undef value. for (int Idx = Mask[i]; i != e; ++i) if (Mask[i] >= 0 && Mask[i] != Idx) return false; return true; } #ifndef NDEBUG static void checkForCyclesHelper(const SDNode *N, SmallPtrSetImpl &Visited, SmallPtrSetImpl &Checked, const llvm::SelectionDAG *DAG) { // If this node has already been checked, don't check it again. if (Checked.count(N)) return; // If a node has already been visited on this depth-first walk, reject it as // a cycle. if (!Visited.insert(N).second) { errs() << "Detected cycle in SelectionDAG\n"; dbgs() << "Offending node:\n"; N->dumprFull(DAG); dbgs() << "\n"; abort(); } for(unsigned i = 0, e = N->getNumOperands(); i != e; ++i) checkForCyclesHelper(N->getOperand(i).getNode(), Visited, Checked, DAG); Checked.insert(N); Visited.erase(N); } #endif void llvm::checkForCycles(const llvm::SDNode *N, const llvm::SelectionDAG *DAG, bool force) { #ifndef NDEBUG bool check = force; #ifdef XDEBUG check = true; #endif // XDEBUG if (check) { assert(N && "Checking nonexistent SDNode"); SmallPtrSet visited; SmallPtrSet checked; checkForCyclesHelper(N, visited, checked, DAG); } #endif // !NDEBUG } void llvm::checkForCycles(const llvm::SelectionDAG *DAG, bool force) { checkForCycles(DAG->getRoot().getNode(), DAG, force); } Index: projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (revision 279025) @@ -1,7954 +1,7956 @@ //===-- SelectionDAGBuilder.cpp - Selection-DAG building ------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This implements routines for translating from LLVM IR into SelectionDAG IR. // //===----------------------------------------------------------------------===// #include "SelectionDAGBuilder.h" #include "SDNodeDbgValue.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCStrategy.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Statepoint.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include using namespace llvm; #define DEBUG_TYPE "isel" /// LimitFloatPrecision - Generate low-precision inline sequences for /// some float libcalls (6, 8 or 12 bits). static unsigned LimitFloatPrecision; static cl::opt LimitFPPrecision("limit-float-precision", cl::desc("Generate low-precision inline sequences " "for some float libcalls"), cl::location(LimitFloatPrecision), cl::init(0)); // Limit the width of DAG chains. This is important in general to prevent // prevent DAG-based analysis from blowing up. For example, alias analysis and // load clustering may not complete in reasonable time. It is difficult to // recognize and avoid this situation within each individual analysis, and // future analyses are likely to have the same behavior. Limiting DAG width is // the safe approach, and will be especially important with global DAGs. // // MaxParallelChains default is arbitrarily high to avoid affecting // optimization, but could be lowered to improve compile time. Any ld-ld-st-st // sequence over this should have been converted to llvm.memcpy by the // frontend. It easy to induce this behavior with .ll code such as: // %buffer = alloca [4096 x i8] // %data = load [4096 x i8]* %argPtr // store [4096 x i8] %data, [4096 x i8]* %buffer static const unsigned MaxParallelChains = 64; static SDValue getCopyFromPartsVector(SelectionDAG &DAG, SDLoc DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V); /// getCopyFromParts - Create a value that contains the specified legal parts /// combined into the value they represent. If the parts combine to a type /// larger then ValueVT then AssertOp can be used to specify whether the extra /// bits are known to be zero (ISD::AssertZext) or sign extended from ValueVT /// (ISD::AssertSext). static SDValue getCopyFromParts(SelectionDAG &DAG, SDLoc DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V, ISD::NodeType AssertOp = ISD::DELETED_NODE) { if (ValueVT.isVector()) return getCopyFromPartsVector(DAG, DL, Parts, NumParts, PartVT, ValueVT, V); assert(NumParts > 0 && "No parts to assemble!"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Val = Parts[0]; if (NumParts > 1) { // Assemble the value from multiple parts. if (ValueVT.isInteger()) { unsigned PartBits = PartVT.getSizeInBits(); unsigned ValueBits = ValueVT.getSizeInBits(); // Assemble the power of 2 part. unsigned RoundParts = NumParts & (NumParts - 1) ? 1 << Log2_32(NumParts) : NumParts; unsigned RoundBits = PartBits * RoundParts; EVT RoundVT = RoundBits == ValueBits ? ValueVT : EVT::getIntegerVT(*DAG.getContext(), RoundBits); SDValue Lo, Hi; EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), RoundBits/2); if (RoundParts > 2) { Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2, PartVT, HalfVT, V); Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2, RoundParts / 2, PartVT, HalfVT, V); } else { Lo = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[0]); Hi = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[1]); } if (TLI.isBigEndian()) std::swap(Lo, Hi); Val = DAG.getNode(ISD::BUILD_PAIR, DL, RoundVT, Lo, Hi); if (RoundParts < NumParts) { // Assemble the trailing non-power-of-2 part. unsigned OddParts = NumParts - RoundParts; EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits); Hi = getCopyFromParts(DAG, DL, Parts + RoundParts, OddParts, PartVT, OddVT, V); // Combine the round and odd parts. Lo = Val; if (TLI.isBigEndian()) std::swap(Lo, Hi); EVT TotalVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); Hi = DAG.getNode(ISD::ANY_EXTEND, DL, TotalVT, Hi); Hi = DAG.getNode(ISD::SHL, DL, TotalVT, Hi, DAG.getConstant(Lo.getValueType().getSizeInBits(), TLI.getPointerTy())); Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, TotalVT, Lo); Val = DAG.getNode(ISD::OR, DL, TotalVT, Lo, Hi); } } else if (PartVT.isFloatingPoint()) { // FP split into multiple FP parts (for ppcf128) assert(ValueVT == EVT(MVT::ppcf128) && PartVT == MVT::f64 && "Unexpected split"); SDValue Lo, Hi; Lo = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[0]); Hi = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[1]); if (TLI.hasBigEndianPartOrdering(ValueVT)) std::swap(Lo, Hi); Val = DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, Lo, Hi); } else { // FP split into integer parts (soft fp) assert(ValueVT.isFloatingPoint() && PartVT.isInteger() && !PartVT.isVector() && "Unexpected split"); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()); Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V); } } // There is now one part, held in Val. Correct it to match ValueVT. EVT PartEVT = Val.getValueType(); if (PartEVT == ValueVT) return Val; if (PartEVT.isInteger() && ValueVT.isInteger()) { if (ValueVT.bitsLT(PartEVT)) { // For a truncate, see if we have any information to // indicate whether the truncated bits will always be // zero or sign-extension. if (AssertOp != ISD::DELETED_NODE) Val = DAG.getNode(AssertOp, DL, PartEVT, Val, DAG.getValueType(ValueVT)); return DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val); } return DAG.getNode(ISD::ANY_EXTEND, DL, ValueVT, Val); } if (PartEVT.isFloatingPoint() && ValueVT.isFloatingPoint()) { // FP_ROUND's are always exact here. if (ValueVT.bitsLT(Val.getValueType())) return DAG.getNode(ISD::FP_ROUND, DL, ValueVT, Val, DAG.getTargetConstant(1, TLI.getPointerTy())); return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val); } if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits()) return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); llvm_unreachable("Unknown mismatch!"); } static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V, const Twine &ErrMsg) { const Instruction *I = dyn_cast_or_null(V); if (!V) return Ctx.emitError(ErrMsg); const char *AsmError = ", possible invalid constraint for vector type"; if (const CallInst *CI = dyn_cast(I)) if (isa(CI->getCalledValue())) return Ctx.emitError(I, ErrMsg + AsmError); return Ctx.emitError(I, ErrMsg); } /// getCopyFromPartsVector - Create a value that contains the specified legal /// parts combined into the value they represent. If the parts combine to a /// type larger then ValueVT then AssertOp can be used to specify whether the /// extra bits are known to be zero (ISD::AssertZext) or sign extended from /// ValueVT (ISD::AssertSext). static SDValue getCopyFromPartsVector(SelectionDAG &DAG, SDLoc DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V) { assert(ValueVT.isVector() && "Not a vector value"); assert(NumParts > 0 && "No parts to assemble!"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Val = Parts[0]; // Handle a multi-element vector. if (NumParts > 1) { EVT IntermediateVT; MVT RegisterVT; unsigned NumIntermediates; unsigned NumRegs = TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT, NumIntermediates, RegisterVT); assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!"); NumParts = NumRegs; // Silence a compiler warning. assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!"); assert(RegisterVT == Parts[0].getSimpleValueType() && "Part type doesn't match part!"); // Assemble the parts into intermediate operands. SmallVector Ops(NumIntermediates); if (NumIntermediates == NumParts) { // If the register was not expanded, truncate or copy the value, // as appropriate. for (unsigned i = 0; i != NumParts; ++i) Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1, PartVT, IntermediateVT, V); } else if (NumParts > 0) { // If the intermediate type was expanded, build the intermediate // operands from the parts. assert(NumParts % NumIntermediates == 0 && "Must expand into a divisible number of parts!"); unsigned Factor = NumParts / NumIntermediates; for (unsigned i = 0; i != NumIntermediates; ++i) Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor, PartVT, IntermediateVT, V); } // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the // intermediate operands. Val = DAG.getNode(IntermediateVT.isVector() ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, DL, ValueVT, Ops); } // There is now one part, held in Val. Correct it to match ValueVT. EVT PartEVT = Val.getValueType(); if (PartEVT == ValueVT) return Val; if (PartEVT.isVector()) { // If the element type of the source/dest vectors are the same, but the // parts vector has more elements than the value vector, then we have a // vector widening case (e.g. <2 x float> -> <4 x float>). Extract the // elements we want. if (PartEVT.getVectorElementType() == ValueVT.getVectorElementType()) { assert(PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements() && "Cannot narrow, it would be a lossy transformation"); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val, DAG.getConstant(0, TLI.getVectorIdxTy())); } // Vector/Vector bitcast. if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits()) return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); assert(PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements() && "Cannot handle this kind of promotion"); // Promoted vector extract bool Smaller = ValueVT.bitsLE(PartEVT); return DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND), DL, ValueVT, Val); } // Trivial bitcast if the types are the same size and the destination // vector type is legal. if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits() && TLI.isTypeLegal(ValueVT)) return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); // Handle cases such as i8 -> <1 x i1> if (ValueVT.getVectorNumElements() != 1) { diagnosePossiblyInvalidConstraint(*DAG.getContext(), V, "non-trivial scalar-to-vector conversion"); return DAG.getUNDEF(ValueVT); } if (ValueVT.getVectorNumElements() == 1 && ValueVT.getVectorElementType() != PartEVT) { bool Smaller = ValueVT.bitsLE(PartEVT); Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND), DL, ValueVT.getScalarType(), Val); } return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val); } static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc dl, SDValue Val, SDValue *Parts, unsigned NumParts, MVT PartVT, const Value *V); /// getCopyToParts - Create a series of nodes that contain the specified value /// split into legal parts. If the parts contain more bits than Val, then, for /// integers, ExtendKind can be used to specify how to generate the extra bits. static void getCopyToParts(SelectionDAG &DAG, SDLoc DL, SDValue Val, SDValue *Parts, unsigned NumParts, MVT PartVT, const Value *V, ISD::NodeType ExtendKind = ISD::ANY_EXTEND) { EVT ValueVT = Val.getValueType(); // Handle the vector case separately. if (ValueVT.isVector()) return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned PartBits = PartVT.getSizeInBits(); unsigned OrigNumParts = NumParts; assert(TLI.isTypeLegal(PartVT) && "Copying to an illegal type!"); if (NumParts == 0) return; assert(!ValueVT.isVector() && "Vector case handled elsewhere"); EVT PartEVT = PartVT; if (PartEVT == ValueVT) { assert(NumParts == 1 && "No-op copy with multiple parts!"); Parts[0] = Val; return; } if (NumParts * PartBits > ValueVT.getSizeInBits()) { // If the parts cover more bits than the value has, promote the value. if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) { assert(NumParts == 1 && "Do not know what to promote to!"); Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val); } else { assert((PartVT.isInteger() || PartVT == MVT::x86mmx) && ValueVT.isInteger() && "Unknown mismatch!"); ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); Val = DAG.getNode(ExtendKind, DL, ValueVT, Val); if (PartVT == MVT::x86mmx) Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); } } else if (PartBits == ValueVT.getSizeInBits()) { // Different types of the same size. assert(NumParts == 1 && PartEVT != ValueVT); Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); } else if (NumParts * PartBits < ValueVT.getSizeInBits()) { // If the parts cover less bits than value has, truncate the value. assert((PartVT.isInteger() || PartVT == MVT::x86mmx) && ValueVT.isInteger() && "Unknown mismatch!"); ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val); if (PartVT == MVT::x86mmx) Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); } // The value may have changed - recompute ValueVT. ValueVT = Val.getValueType(); assert(NumParts * PartBits == ValueVT.getSizeInBits() && "Failed to tile the value with PartVT!"); if (NumParts == 1) { if (PartEVT != ValueVT) diagnosePossiblyInvalidConstraint(*DAG.getContext(), V, "scalar-to-vector conversion failed"); Parts[0] = Val; return; } // Expand the value into multiple parts. if (NumParts & (NumParts - 1)) { // The number of parts is not a power of 2. Split off and copy the tail. assert(PartVT.isInteger() && ValueVT.isInteger() && "Do not know what to expand to!"); unsigned RoundParts = 1 << Log2_32(NumParts); unsigned RoundBits = RoundParts * PartBits; unsigned OddParts = NumParts - RoundParts; SDValue OddVal = DAG.getNode(ISD::SRL, DL, ValueVT, Val, DAG.getIntPtrConstant(RoundBits)); getCopyToParts(DAG, DL, OddVal, Parts + RoundParts, OddParts, PartVT, V); if (TLI.isBigEndian()) // The odd parts were reversed by getCopyToParts - unreverse them. std::reverse(Parts + RoundParts, Parts + NumParts); NumParts = RoundParts; ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val); } // The number of parts is a power of 2. Repeatedly bisect the value using // EXTRACT_ELEMENT. Parts[0] = DAG.getNode(ISD::BITCAST, DL, EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()), Val); for (unsigned StepSize = NumParts; StepSize > 1; StepSize /= 2) { for (unsigned i = 0; i < NumParts; i += StepSize) { unsigned ThisBits = StepSize * PartBits / 2; EVT ThisVT = EVT::getIntegerVT(*DAG.getContext(), ThisBits); SDValue &Part0 = Parts[i]; SDValue &Part1 = Parts[i+StepSize/2]; Part1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, ThisVT, Part0, DAG.getIntPtrConstant(1)); Part0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, ThisVT, Part0, DAG.getIntPtrConstant(0)); if (ThisBits == PartBits && ThisVT != PartVT) { Part0 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part0); Part1 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part1); } } } if (TLI.isBigEndian()) std::reverse(Parts, Parts + OrigNumParts); } /// getCopyToPartsVector - Create a series of nodes that contain the specified /// value split into legal parts. static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc DL, SDValue Val, SDValue *Parts, unsigned NumParts, MVT PartVT, const Value *V) { EVT ValueVT = Val.getValueType(); assert(ValueVT.isVector() && "Not a vector"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (NumParts == 1) { EVT PartEVT = PartVT; if (PartEVT == ValueVT) { // Nothing to do. } else if (PartVT.getSizeInBits() == ValueVT.getSizeInBits()) { // Bitconvert vector->vector case. Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); } else if (PartVT.isVector() && PartEVT.getVectorElementType() == ValueVT.getVectorElementType() && PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements()) { EVT ElementVT = PartVT.getVectorElementType(); // Vector widening case, e.g. <2 x float> -> <4 x float>. Shuffle in // undef elements. SmallVector Ops; for (unsigned i = 0, e = ValueVT.getVectorNumElements(); i != e; ++i) Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElementVT, Val, DAG.getConstant(i, TLI.getVectorIdxTy()))); for (unsigned i = ValueVT.getVectorNumElements(), e = PartVT.getVectorNumElements(); i != e; ++i) Ops.push_back(DAG.getUNDEF(ElementVT)); Val = DAG.getNode(ISD::BUILD_VECTOR, DL, PartVT, Ops); // FIXME: Use CONCAT for 2x -> 4x. //SDValue UndefElts = DAG.getUNDEF(VectorTy); //Val = DAG.getNode(ISD::CONCAT_VECTORS, DL, PartVT, Val, UndefElts); } else if (PartVT.isVector() && PartEVT.getVectorElementType().bitsGE( ValueVT.getVectorElementType()) && PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements()) { // Promoted vector extract bool Smaller = PartEVT.bitsLE(ValueVT); Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND), DL, PartVT, Val); } else{ // Vector -> scalar conversion. assert(ValueVT.getVectorNumElements() == 1 && "Only trivial vector-to-scalar conversions should get here!"); Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val, DAG.getConstant(0, TLI.getVectorIdxTy())); bool Smaller = ValueVT.bitsLE(PartVT); Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND), DL, PartVT, Val); } Parts[0] = Val; return; } // Handle a multi-element vector. EVT IntermediateVT; MVT RegisterVT; unsigned NumIntermediates; unsigned NumRegs = TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT, NumIntermediates, RegisterVT); unsigned NumElements = ValueVT.getVectorNumElements(); assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!"); NumParts = NumRegs; // Silence a compiler warning. assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!"); // Split the vector into intermediate operands. SmallVector Ops(NumIntermediates); for (unsigned i = 0; i != NumIntermediates; ++i) { if (IntermediateVT.isVector()) Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val, DAG.getConstant(i * (NumElements / NumIntermediates), TLI.getVectorIdxTy())); else Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IntermediateVT, Val, DAG.getConstant(i, TLI.getVectorIdxTy())); } // Split the intermediate operands into legal parts. if (NumParts == NumIntermediates) { // If the register was not expanded, promote or copy the value, // as appropriate. for (unsigned i = 0; i != NumParts; ++i) getCopyToParts(DAG, DL, Ops[i], &Parts[i], 1, PartVT, V); } else if (NumParts > 0) { // If the intermediate type was expanded, split each the value into // legal parts. assert(NumIntermediates != 0 && "division by zero"); assert(NumParts % NumIntermediates == 0 && "Must expand into a divisible number of parts!"); unsigned Factor = NumParts / NumIntermediates; for (unsigned i = 0; i != NumIntermediates; ++i) getCopyToParts(DAG, DL, Ops[i], &Parts[i*Factor], Factor, PartVT, V); } } namespace { /// RegsForValue - This struct represents the registers (physical or virtual) /// that a particular set of values is assigned, and the type information /// about the value. The most common situation is to represent one value at a /// time, but struct or array values are handled element-wise as multiple /// values. The splitting of aggregates is performed recursively, so that we /// never have aggregate-typed registers. The values at this point do not /// necessarily have legal types, so each value may require one or more /// registers of some legal type. /// struct RegsForValue { /// ValueVTs - The value types of the values, which may not be legal, and /// may need be promoted or synthesized from one or more registers. /// SmallVector ValueVTs; /// RegVTs - The value types of the registers. This is the same size as /// ValueVTs and it records, for each value, what the type of the assigned /// register or registers are. (Individual values are never synthesized /// from more than one type of register.) /// /// With virtual registers, the contents of RegVTs is redundant with TLI's /// getRegisterType member function, however when with physical registers /// it is necessary to have a separate record of the types. /// SmallVector RegVTs; /// Regs - This list holds the registers assigned to the values. /// Each legal or promoted value requires one register, and each /// expanded value requires multiple registers. /// SmallVector Regs; RegsForValue() {} RegsForValue(const SmallVector ®s, MVT regvt, EVT valuevt) : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {} RegsForValue(LLVMContext &Context, const TargetLowering &tli, unsigned Reg, Type *Ty) { ComputeValueVTs(tli, Ty, ValueVTs); for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) { EVT ValueVT = ValueVTs[Value]; unsigned NumRegs = tli.getNumRegisters(Context, ValueVT); MVT RegisterVT = tli.getRegisterType(Context, ValueVT); for (unsigned i = 0; i != NumRegs; ++i) Regs.push_back(Reg + i); RegVTs.push_back(RegisterVT); Reg += NumRegs; } } /// append - Add the specified values to this one. void append(const RegsForValue &RHS) { ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end()); RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end()); Regs.append(RHS.Regs.begin(), RHS.Regs.end()); } /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from /// this value and returns the result as a ValueVTs value. This uses /// Chain/Flag as the input and updates them for the output Chain/Flag. /// If the Flag pointer is NULL, no flag is used. SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo, SDLoc dl, SDValue &Chain, SDValue *Flag, const Value *V = nullptr) const; /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the /// specified value into the registers specified by this object. This uses /// Chain/Flag as the input and updates them for the output Chain/Flag. /// If the Flag pointer is NULL, no flag is used. void getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl, SDValue &Chain, SDValue *Flag, const Value *V, ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const; /// AddInlineAsmOperands - Add this value to the specified inlineasm node /// operand list. This adds the code marker, matching input operand index /// (if applicable), and includes the number of values added into it. void AddInlineAsmOperands(unsigned Kind, bool HasMatching, unsigned MatchingIdx, SelectionDAG &DAG, std::vector &Ops) const; }; } /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from /// this value and returns the result as a ValueVT value. This uses /// Chain/Flag as the input and updates them for the output Chain/Flag. /// If the Flag pointer is NULL, no flag is used. SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo, SDLoc dl, SDValue &Chain, SDValue *Flag, const Value *V) const { // A Value with type {} or [0 x %t] needs no registers. if (ValueVTs.empty()) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Assemble the legal parts into the final values. SmallVector Values(ValueVTs.size()); SmallVector Parts; for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) { // Copy the legal parts from the registers. EVT ValueVT = ValueVTs[Value]; unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVT); MVT RegisterVT = RegVTs[Value]; Parts.resize(NumRegs); for (unsigned i = 0; i != NumRegs; ++i) { SDValue P; if (!Flag) { P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT); } else { P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag); *Flag = P.getValue(2); } Chain = P.getValue(1); Parts[i] = P; // If the source register was virtual and if we know something about it, // add an assert node. if (!TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) || !RegisterVT.isInteger() || RegisterVT.isVector()) continue; const FunctionLoweringInfo::LiveOutInfo *LOI = FuncInfo.GetLiveOutRegInfo(Regs[Part+i]); if (!LOI) continue; unsigned RegSize = RegisterVT.getSizeInBits(); unsigned NumSignBits = LOI->NumSignBits; unsigned NumZeroBits = LOI->KnownZero.countLeadingOnes(); if (NumZeroBits == RegSize) { // The current value is a zero. // Explicitly express that as it would be easier for // optimizations to kick in. Parts[i] = DAG.getConstant(0, RegisterVT); continue; } // FIXME: We capture more information than the dag can represent. For // now, just use the tightest assertzext/assertsext possible. bool isSExt = true; EVT FromVT(MVT::Other); if (NumSignBits == RegSize) isSExt = true, FromVT = MVT::i1; // ASSERT SEXT 1 else if (NumZeroBits >= RegSize-1) isSExt = false, FromVT = MVT::i1; // ASSERT ZEXT 1 else if (NumSignBits > RegSize-8) isSExt = true, FromVT = MVT::i8; // ASSERT SEXT 8 else if (NumZeroBits >= RegSize-8) isSExt = false, FromVT = MVT::i8; // ASSERT ZEXT 8 else if (NumSignBits > RegSize-16) isSExt = true, FromVT = MVT::i16; // ASSERT SEXT 16 else if (NumZeroBits >= RegSize-16) isSExt = false, FromVT = MVT::i16; // ASSERT ZEXT 16 else if (NumSignBits > RegSize-32) isSExt = true, FromVT = MVT::i32; // ASSERT SEXT 32 else if (NumZeroBits >= RegSize-32) isSExt = false, FromVT = MVT::i32; // ASSERT ZEXT 32 else continue; // Add an assertion node. assert(FromVT != MVT::Other); Parts[i] = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl, RegisterVT, P, DAG.getValueType(FromVT)); } Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(), NumRegs, RegisterVT, ValueVT, V); Part += NumRegs; Parts.clear(); } return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values); } /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the /// specified value into the registers specified by this object. This uses /// Chain/Flag as the input and updates them for the output Chain/Flag. /// If the Flag pointer is NULL, no flag is used. void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl, SDValue &Chain, SDValue *Flag, const Value *V, ISD::NodeType PreferredExtendType) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); ISD::NodeType ExtendKind = PreferredExtendType; // Get the list of the values's legal parts. unsigned NumRegs = Regs.size(); SmallVector Parts(NumRegs); for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) { EVT ValueVT = ValueVTs[Value]; unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), ValueVT); MVT RegisterVT = RegVTs[Value]; if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT)) ExtendKind = ISD::ZERO_EXTEND; getCopyToParts(DAG, dl, Val.getValue(Val.getResNo() + Value), &Parts[Part], NumParts, RegisterVT, V, ExtendKind); Part += NumParts; } // Copy the parts into the registers. SmallVector Chains(NumRegs); for (unsigned i = 0; i != NumRegs; ++i) { SDValue Part; if (!Flag) { Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]); } else { Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Flag); *Flag = Part.getValue(1); } Chains[i] = Part.getValue(0); } if (NumRegs == 1 || Flag) // If NumRegs > 1 && Flag is used then the use of the last CopyToReg is // flagged to it. That is the CopyToReg nodes and the user are considered // a single scheduling unit. If we create a TokenFactor and return it as // chain, then the TokenFactor is both a predecessor (operand) of the // user as well as a successor (the TF operands are flagged to the user). // c1, f1 = CopyToReg // c2, f2 = CopyToReg // c3 = TokenFactor c1, c2 // ... // = op c3, ..., f2 Chain = Chains[NumRegs-1]; else Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } /// AddInlineAsmOperands - Add this value to the specified inlineasm node /// operand list. This adds the code marker and includes the number of /// values added into it. void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching, unsigned MatchingIdx, SelectionDAG &DAG, std::vector &Ops) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size()); if (HasMatching) Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx); else if (!Regs.empty() && TargetRegisterInfo::isVirtualRegister(Regs.front())) { // Put the register class of the virtual registers in the flag word. That // way, later passes can recompute register class constraints for inline // assembly as well as normal instructions. // Don't do this for tied operands that can use the regclass information // from the def. const MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); const TargetRegisterClass *RC = MRI.getRegClass(Regs.front()); Flag = InlineAsm::getFlagWordForRegClass(Flag, RC->getID()); } SDValue Res = DAG.getTargetConstant(Flag, MVT::i32); Ops.push_back(Res); unsigned SP = TLI.getStackPointerRegisterToSaveRestore(); for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) { unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVTs[Value]); MVT RegisterVT = RegVTs[Value]; for (unsigned i = 0; i != NumRegs; ++i) { assert(Reg < Regs.size() && "Mismatch in # registers expected"); unsigned TheReg = Regs[Reg++]; Ops.push_back(DAG.getRegister(TheReg, RegisterVT)); if (TheReg == SP && Code == InlineAsm::Kind_Clobber) { // If we clobbered the stack pointer, MFI should know about it. assert(DAG.getMachineFunction().getFrameInfo()-> hasInlineAsmWithSPAdjust()); } } } } void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa, const TargetLibraryInfo *li) { AA = &aa; GFI = gfi; LibInfo = li; DL = DAG.getSubtarget().getDataLayout(); Context = DAG.getContext(); LPadToCallSiteMap.clear(); } /// clear - Clear out the current SelectionDAG and the associated /// state and prepare this SelectionDAGBuilder object to be used /// for a new block. This doesn't clear out information about /// additional blocks that are needed to complete switch lowering /// or PHI node updating; that information is cleared out as it is /// consumed. void SelectionDAGBuilder::clear() { NodeMap.clear(); UnusedArgNodeMap.clear(); PendingLoads.clear(); PendingExports.clear(); CurInst = nullptr; HasTailCall = false; SDNodeOrder = LowestSDNodeOrder; StatepointLowering.clear(); } /// clearDanglingDebugInfo - Clear the dangling debug information /// map. This function is separated from the clear so that debug /// information that is dangling in a basic block can be properly /// resolved in a different basic block. This allows the /// SelectionDAG to resolve dangling debug information attached /// to PHI nodes. void SelectionDAGBuilder::clearDanglingDebugInfo() { DanglingDebugInfoMap.clear(); } /// getRoot - Return the current virtual root of the Selection DAG, /// flushing any PendingLoad items. This must be done before emitting /// a store or any other node that may need to be ordered after any /// prior load instructions. /// SDValue SelectionDAGBuilder::getRoot() { if (PendingLoads.empty()) return DAG.getRoot(); if (PendingLoads.size() == 1) { SDValue Root = PendingLoads[0]; DAG.setRoot(Root); PendingLoads.clear(); return Root; } // Otherwise, we have to make a token factor node. SDValue Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, PendingLoads); PendingLoads.clear(); DAG.setRoot(Root); return Root; } /// getControlRoot - Similar to getRoot, but instead of flushing all the /// PendingLoad items, flush all the PendingExports items. It is necessary /// to do this before emitting a terminator instruction. /// SDValue SelectionDAGBuilder::getControlRoot() { SDValue Root = DAG.getRoot(); if (PendingExports.empty()) return Root; // Turn all of the CopyToReg chains into one factored node. if (Root.getOpcode() != ISD::EntryToken) { unsigned i = 0, e = PendingExports.size(); for (; i != e; ++i) { assert(PendingExports[i].getNode()->getNumOperands() > 1); if (PendingExports[i].getNode()->getOperand(0) == Root) break; // Don't add the root if we already indirectly depend on it. } if (i == e) PendingExports.push_back(Root); } Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, PendingExports); PendingExports.clear(); DAG.setRoot(Root); return Root; } void SelectionDAGBuilder::visit(const Instruction &I) { // Set up outgoing PHI node register values before emitting the terminator. if (isa(&I)) HandlePHINodesInSuccessorBlocks(I.getParent()); ++SDNodeOrder; CurInst = &I; visit(I.getOpcode(), I); if (!isa(&I) && !HasTailCall) CopyToExportRegsIfNeeded(&I); CurInst = nullptr; } void SelectionDAGBuilder::visitPHI(const PHINode &) { llvm_unreachable("SelectionDAGBuilder shouldn't visit PHI nodes!"); } void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) { // Note: this doesn't use InstVisitor, because it has to work with // ConstantExpr's in addition to instructions. switch (Opcode) { default: llvm_unreachable("Unknown instruction type encountered!"); // Build the switch statement using the Instruction.def file. #define HANDLE_INST(NUM, OPCODE, CLASS) \ case Instruction::OPCODE: visit##OPCODE((const CLASS&)I); break; #include "llvm/IR/Instruction.def" } } // resolveDanglingDebugInfo - if we saw an earlier dbg_value referring to V, // generate the debug data structures now that we've seen its definition. void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V, SDValue Val) { DanglingDebugInfo &DDI = DanglingDebugInfoMap[V]; if (DDI.getDI()) { const DbgValueInst *DI = DDI.getDI(); DebugLoc dl = DDI.getdl(); unsigned DbgSDNodeOrder = DDI.getSDNodeOrder(); MDNode *Variable = DI->getVariable(); MDNode *Expr = DI->getExpression(); uint64_t Offset = DI->getOffset(); // A dbg.value for an alloca is always indirect. bool IsIndirect = isa(V) || Offset != 0; SDDbgValue *SDV; if (Val.getNode()) { if (!EmitFuncArgumentDbgValue(V, Variable, Expr, Offset, IsIndirect, Val)) { SDV = DAG.getDbgValue(Variable, Expr, Val.getNode(), Val.getResNo(), IsIndirect, Offset, dl, DbgSDNodeOrder); DAG.AddDbgValue(SDV, Val.getNode(), false); } } else DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n"); DanglingDebugInfoMap[V] = DanglingDebugInfo(); } } /// getValue - Return an SDValue for the given Value. SDValue SelectionDAGBuilder::getValue(const Value *V) { // If we already have an SDValue for this value, use it. It's important // to do this first, so that we don't create a CopyFromReg if we already // have a regular SDValue. SDValue &N = NodeMap[V]; if (N.getNode()) return N; // If there's a virtual register allocated and initialized for this // value, use it. DenseMap::iterator It = FuncInfo.ValueMap.find(V); if (It != FuncInfo.ValueMap.end()) { unsigned InReg = It->second; RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), InReg, V->getType()); SDValue Chain = DAG.getEntryNode(); N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V); resolveDanglingDebugInfo(V, N); return N; } // Otherwise create a new SDValue and remember it. SDValue Val = getValueImpl(V); NodeMap[V] = Val; resolveDanglingDebugInfo(V, Val); return Val; } /// getNonRegisterValue - Return an SDValue for the given Value, but /// don't look in FuncInfo.ValueMap for a virtual register. SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) { // If we already have an SDValue for this value, use it. SDValue &N = NodeMap[V]; if (N.getNode()) return N; // Otherwise create a new SDValue and remember it. SDValue Val = getValueImpl(V); NodeMap[V] = Val; resolveDanglingDebugInfo(V, Val); return Val; } /// getValueImpl - Helper function for getValue and getNonRegisterValue. /// Create an SDValue for the given value. SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (const Constant *C = dyn_cast(V)) { EVT VT = TLI.getValueType(V->getType(), true); if (const ConstantInt *CI = dyn_cast(C)) return DAG.getConstant(*CI, VT); if (const GlobalValue *GV = dyn_cast(C)) return DAG.getGlobalAddress(GV, getCurSDLoc(), VT); if (isa(C)) { unsigned AS = V->getType()->getPointerAddressSpace(); return DAG.getConstant(0, TLI.getPointerTy(AS)); } if (const ConstantFP *CFP = dyn_cast(C)) return DAG.getConstantFP(*CFP, VT); if (isa(C) && !V->getType()->isAggregateType()) return DAG.getUNDEF(VT); if (const ConstantExpr *CE = dyn_cast(C)) { visit(CE->getOpcode(), *CE); SDValue N1 = NodeMap[V]; assert(N1.getNode() && "visit didn't populate the NodeMap!"); return N1; } if (isa(C) || isa(C)) { SmallVector Constants; for (User::const_op_iterator OI = C->op_begin(), OE = C->op_end(); OI != OE; ++OI) { SDNode *Val = getValue(*OI).getNode(); // If the operand is an empty aggregate, there are no values. if (!Val) continue; // Add each leaf value from the operand to the Constants list // to form a flattened list of all the values. for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i) Constants.push_back(SDValue(Val, i)); } return DAG.getMergeValues(Constants, getCurSDLoc()); } if (const ConstantDataSequential *CDS = dyn_cast(C)) { SmallVector Ops; for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { SDNode *Val = getValue(CDS->getElementAsConstant(i)).getNode(); // Add each leaf value from the operand to the Constants list // to form a flattened list of all the values. for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i) Ops.push_back(SDValue(Val, i)); } if (isa(CDS->getType())) return DAG.getMergeValues(Ops, getCurSDLoc()); return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), VT, Ops); } if (C->getType()->isStructTy() || C->getType()->isArrayTy()) { assert((isa(C) || isa(C)) && "Unknown struct or array constant!"); SmallVector ValueVTs; ComputeValueVTs(TLI, C->getType(), ValueVTs); unsigned NumElts = ValueVTs.size(); if (NumElts == 0) return SDValue(); // empty struct SmallVector Constants(NumElts); for (unsigned i = 0; i != NumElts; ++i) { EVT EltVT = ValueVTs[i]; if (isa(C)) Constants[i] = DAG.getUNDEF(EltVT); else if (EltVT.isFloatingPoint()) Constants[i] = DAG.getConstantFP(0, EltVT); else Constants[i] = DAG.getConstant(0, EltVT); } return DAG.getMergeValues(Constants, getCurSDLoc()); } if (const BlockAddress *BA = dyn_cast(C)) return DAG.getBlockAddress(BA, VT); VectorType *VecTy = cast(V->getType()); unsigned NumElements = VecTy->getNumElements(); // Now that we know the number and type of the elements, get that number of // elements into the Ops array based on what kind of constant it is. SmallVector Ops; if (const ConstantVector *CV = dyn_cast(C)) { for (unsigned i = 0; i != NumElements; ++i) Ops.push_back(getValue(CV->getOperand(i))); } else { assert(isa(C) && "Unknown vector constant!"); EVT EltVT = TLI.getValueType(VecTy->getElementType()); SDValue Op; if (EltVT.isFloatingPoint()) Op = DAG.getConstantFP(0, EltVT); else Op = DAG.getConstant(0, EltVT); Ops.assign(NumElements, Op); } // Create a BUILD_VECTOR node. return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), VT, Ops); } // If this is a static alloca, generate it as the frameindex instead of // computation. if (const AllocaInst *AI = dyn_cast(V)) { DenseMap::iterator SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) return DAG.getFrameIndex(SI->second, TLI.getPointerTy()); } // If this is an instruction which fast-isel has deferred, select it now. if (const Instruction *Inst = dyn_cast(V)) { unsigned InReg = FuncInfo.InitializeRegForValue(Inst); RegsForValue RFV(*DAG.getContext(), TLI, InReg, Inst->getType()); SDValue Chain = DAG.getEntryNode(); return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V); } llvm_unreachable("Can't get register for value!"); } void SelectionDAGBuilder::visitRet(const ReturnInst &I) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Chain = getControlRoot(); SmallVector Outs; SmallVector OutVals; if (!FuncInfo.CanLowerReturn) { unsigned DemoteReg = FuncInfo.DemoteRegister; const Function *F = I.getParent()->getParent(); // Emit a store of the return value through the virtual register. // Leave Outs empty so that LowerReturn won't try to load return // registers the usual way. SmallVector PtrValueVTs; ComputeValueVTs(TLI, PointerType::getUnqual(F->getReturnType()), PtrValueVTs); SDValue RetPtr = DAG.getRegister(DemoteReg, PtrValueVTs[0]); SDValue RetOp = getValue(I.getOperand(0)); SmallVector ValueVTs; SmallVector Offsets; ComputeValueVTs(TLI, I.getOperand(0)->getType(), ValueVTs, &Offsets); unsigned NumValues = ValueVTs.size(); SmallVector Chains(NumValues); for (unsigned i = 0; i != NumValues; ++i) { SDValue Add = DAG.getNode(ISD::ADD, getCurSDLoc(), RetPtr.getValueType(), RetPtr, DAG.getIntPtrConstant(Offsets[i])); Chains[i] = DAG.getStore(Chain, getCurSDLoc(), SDValue(RetOp.getNode(), RetOp.getResNo() + i), // FIXME: better loc info would be nice. Add, MachinePointerInfo(), false, false, 0); } Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, Chains); } else if (I.getNumOperands() != 0) { SmallVector ValueVTs; ComputeValueVTs(TLI, I.getOperand(0)->getType(), ValueVTs); unsigned NumValues = ValueVTs.size(); if (NumValues) { SDValue RetOp = getValue(I.getOperand(0)); const Function *F = I.getParent()->getParent(); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) ExtendKind = ISD::SIGN_EXTEND; else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt)) ExtendKind = ISD::ZERO_EXTEND; LLVMContext &Context = F->getContext(); bool RetInReg = F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, Attribute::InReg); for (unsigned j = 0; j != NumValues; ++j) { EVT VT = ValueVTs[j]; if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) VT = TLI.getTypeForExtArgOrReturn(Context, VT, ExtendKind); unsigned NumParts = TLI.getNumRegisters(Context, VT); MVT PartVT = TLI.getRegisterType(Context, VT); SmallVector Parts(NumParts); getCopyToParts(DAG, getCurSDLoc(), SDValue(RetOp.getNode(), RetOp.getResNo() + j), &Parts[0], NumParts, PartVT, &I, ExtendKind); // 'inreg' on function refers to return value ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); if (RetInReg) Flags.setInReg(); // Propagate extension type if any if (ExtendKind == ISD::SIGN_EXTEND) Flags.setSExt(); else if (ExtendKind == ISD::ZERO_EXTEND) Flags.setZExt(); for (unsigned i = 0; i < NumParts; ++i) { Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(), VT, /*isfixed=*/true, 0, 0)); OutVals.push_back(Parts[i]); } } } } bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg(); CallingConv::ID CallConv = DAG.getMachineFunction().getFunction()->getCallingConv(); Chain = DAG.getTargetLoweringInfo().LowerReturn( Chain, CallConv, isVarArg, Outs, OutVals, getCurSDLoc(), DAG); // Verify that the target's LowerReturn behaved as expected. assert(Chain.getNode() && Chain.getValueType() == MVT::Other && "LowerReturn didn't return a valid chain!"); // Update the DAG with the new chain value resulting from return lowering. DAG.setRoot(Chain); } /// CopyToExportRegsIfNeeded - If the given value has virtual registers /// created for it, emit nodes to copy the value into the virtual /// registers. void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) { // Skip empty types if (V->getType()->isEmptyTy()) return; DenseMap::iterator VMI = FuncInfo.ValueMap.find(V); if (VMI != FuncInfo.ValueMap.end()) { assert(!V->use_empty() && "Unused value assigned virtual registers!"); CopyValueToVirtualRegister(V, VMI->second); } } /// ExportFromCurrentBlock - If this condition isn't known to be exported from /// the current basic block, add it to ValueMap now so that we'll get a /// CopyTo/FromReg. void SelectionDAGBuilder::ExportFromCurrentBlock(const Value *V) { // No need to export constants. if (!isa(V) && !isa(V)) return; // Already exported? if (FuncInfo.isExportedInst(V)) return; unsigned Reg = FuncInfo.InitializeRegForValue(V); CopyValueToVirtualRegister(V, Reg); } bool SelectionDAGBuilder::isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB) { // The operands of the setcc have to be in this block. We don't know // how to export them from some other block. if (const Instruction *VI = dyn_cast(V)) { // Can export from current BB. if (VI->getParent() == FromBB) return true; // Is already exported, noop. return FuncInfo.isExportedInst(V); } // If this is an argument, we can export it if the BB is the entry block or // if it is already exported. if (isa(V)) { if (FromBB == &FromBB->getParent()->getEntryBlock()) return true; // Otherwise, can only export this if it is already exported. return FuncInfo.isExportedInst(V); } // Otherwise, constants can always be exported. return true; } /// Return branch probability calculated by BranchProbabilityInfo for IR blocks. uint32_t SelectionDAGBuilder::getEdgeWeight(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { BranchProbabilityInfo *BPI = FuncInfo.BPI; if (!BPI) return 0; const BasicBlock *SrcBB = Src->getBasicBlock(); const BasicBlock *DstBB = Dst->getBasicBlock(); return BPI->getEdgeWeight(SrcBB, DstBB); } void SelectionDAGBuilder:: addSuccessorWithWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst, uint32_t Weight /* = 0 */) { if (!Weight) Weight = getEdgeWeight(Src, Dst); Src->addSuccessor(Dst, Weight); } static bool InBlock(const Value *V, const BasicBlock *BB) { if (const Instruction *I = dyn_cast(V)) return I->getParent() == BB; return true; } /// EmitBranchForMergedCondition - Helper method for FindMergedConditions. /// This function emits a branch and is used at the leaves of an OR or an /// AND operator tree. /// void SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, uint32_t TWeight, uint32_t FWeight) { const BasicBlock *BB = CurBB->getBasicBlock(); // If the leaf of the tree is a comparison, merge the condition into // the caseblock. if (const CmpInst *BOp = dyn_cast(Cond)) { // The operands of the cmp have to be in this block. We don't know // how to export them from some other block. If this is the first block // of the sequence, no exporting is needed. if (CurBB == SwitchBB || (isExportableFromCurrentBlock(BOp->getOperand(0), BB) && isExportableFromCurrentBlock(BOp->getOperand(1), BB))) { ISD::CondCode Condition; if (const ICmpInst *IC = dyn_cast(Cond)) { Condition = getICmpCondCode(IC->getPredicate()); } else if (const FCmpInst *FC = dyn_cast(Cond)) { Condition = getFCmpCondCode(FC->getPredicate()); if (TM.Options.NoNaNsFPMath) Condition = getFCmpCodeWithoutNaN(Condition); } else { (void)Condition; // silence warning. llvm_unreachable("Unknown compare instruction"); } CaseBlock CB(Condition, BOp->getOperand(0), BOp->getOperand(1), nullptr, TBB, FBB, CurBB, TWeight, FWeight); SwitchCases.push_back(CB); return; } } // Create a CaseBlock record representing this branch. CaseBlock CB(ISD::SETEQ, Cond, ConstantInt::getTrue(*DAG.getContext()), nullptr, TBB, FBB, CurBB, TWeight, FWeight); SwitchCases.push_back(CB); } /// Scale down both weights to fit into uint32_t. static void ScaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) { uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse; uint32_t Scale = (NewMax / UINT32_MAX) + 1; NewTrue = NewTrue / Scale; NewFalse = NewFalse / Scale; } /// FindMergedConditions - If Cond is an expression like void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, unsigned Opc, uint32_t TWeight, uint32_t FWeight) { // If this node is not part of the or/and tree, emit it as a branch. const Instruction *BOp = dyn_cast(Cond); if (!BOp || !(isa(BOp) || isa(BOp)) || (unsigned)BOp->getOpcode() != Opc || !BOp->hasOneUse() || BOp->getParent() != CurBB->getBasicBlock() || !InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) || !InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) { EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB, TWeight, FWeight); return; } // Create TmpBB after CurBB. MachineFunction::iterator BBI = CurBB; MachineFunction &MF = DAG.getMachineFunction(); MachineBasicBlock *TmpBB = MF.CreateMachineBasicBlock(CurBB->getBasicBlock()); CurBB->getParent()->insert(++BBI, TmpBB); if (Opc == Instruction::Or) { // Codegen X | Y as: // BB1: // jmp_if_X TBB // jmp TmpBB // TmpBB: // jmp_if_Y TBB // jmp FBB // // We have flexibility in setting Prob for BB1 and Prob for TmpBB. // The requirement is that // TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB) // = TrueProb for orignal BB. // Assuming the orignal weights are A and B, one choice is to set BB1's // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice // assumes that // TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB. // Another choice is to assume TrueProb for BB1 equals to TrueProb for // TmpBB, but the math is more complicated. uint64_t NewTrueWeight = TWeight; uint64_t NewFalseWeight = (uint64_t)TWeight + 2 * (uint64_t)FWeight; ScaleWeights(NewTrueWeight, NewFalseWeight); // Emit the LHS condition. FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc, NewTrueWeight, NewFalseWeight); NewTrueWeight = TWeight; NewFalseWeight = 2 * (uint64_t)FWeight; ScaleWeights(NewTrueWeight, NewFalseWeight); // Emit the RHS condition into TmpBB. FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, NewTrueWeight, NewFalseWeight); } else { assert(Opc == Instruction::And && "Unknown merge op!"); // Codegen X & Y as: // BB1: // jmp_if_X TmpBB // jmp FBB // TmpBB: // jmp_if_Y TBB // jmp FBB // // This requires creation of TmpBB after CurBB. // We have flexibility in setting Prob for BB1 and Prob for TmpBB. // The requirement is that // FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB) // = FalseProb for orignal BB. // Assuming the orignal weights are A and B, one choice is to set BB1's // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice // assumes that // FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB. uint64_t NewTrueWeight = 2 * (uint64_t)TWeight + (uint64_t)FWeight; uint64_t NewFalseWeight = FWeight; ScaleWeights(NewTrueWeight, NewFalseWeight); // Emit the LHS condition. FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc, NewTrueWeight, NewFalseWeight); NewTrueWeight = 2 * (uint64_t)TWeight; NewFalseWeight = FWeight; ScaleWeights(NewTrueWeight, NewFalseWeight); // Emit the RHS condition into TmpBB. FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, NewTrueWeight, NewFalseWeight); } } /// If the set of cases should be emitted as a series of branches, return true. /// If we should emit this as a bunch of and/or'd together conditions, return /// false. bool SelectionDAGBuilder::ShouldEmitAsBranches(const std::vector &Cases) { if (Cases.size() != 2) return true; // If this is two comparisons of the same values or'd or and'd together, they // will get folded into a single comparison, so don't emit two blocks. if ((Cases[0].CmpLHS == Cases[1].CmpLHS && Cases[0].CmpRHS == Cases[1].CmpRHS) || (Cases[0].CmpRHS == Cases[1].CmpLHS && Cases[0].CmpLHS == Cases[1].CmpRHS)) { return false; } // Handle: (X != null) | (Y != null) --> (X|Y) != 0 // Handle: (X == null) & (Y == null) --> (X|Y) == 0 if (Cases[0].CmpRHS == Cases[1].CmpRHS && Cases[0].CC == Cases[1].CC && isa(Cases[0].CmpRHS) && cast(Cases[0].CmpRHS)->isNullValue()) { if (Cases[0].CC == ISD::SETEQ && Cases[0].TrueBB == Cases[1].ThisBB) return false; if (Cases[0].CC == ISD::SETNE && Cases[0].FalseBB == Cases[1].ThisBB) return false; } return true; } void SelectionDAGBuilder::visitBr(const BranchInst &I) { MachineBasicBlock *BrMBB = FuncInfo.MBB; // Update machine-CFG edges. MachineBasicBlock *Succ0MBB = FuncInfo.MBBMap[I.getSuccessor(0)]; // Figure out which block is immediately after the current one. MachineBasicBlock *NextBlock = nullptr; MachineFunction::iterator BBI = BrMBB; if (++BBI != FuncInfo.MF->end()) NextBlock = BBI; if (I.isUnconditional()) { // Update machine-CFG edges. BrMBB->addSuccessor(Succ0MBB); // If this is not a fall-through branch or optimizations are switched off, // emit the branch. if (Succ0MBB != NextBlock || TM.getOptLevel() == CodeGenOpt::None) DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, getControlRoot(), DAG.getBasicBlock(Succ0MBB))); return; } // If this condition is one of the special cases we handle, do special stuff // now. const Value *CondVal = I.getCondition(); MachineBasicBlock *Succ1MBB = FuncInfo.MBBMap[I.getSuccessor(1)]; // If this is a series of conditions that are or'd or and'd together, emit // this as a sequence of branches instead of setcc's with and/or operations. // As long as jumps are not expensive, this should improve performance. // For example, instead of something like: // cmp A, B // C = seteq // cmp D, E // F = setle // or C, F // jnz foo // Emit: // cmp A, B // je foo // cmp D, E // jle foo // if (const BinaryOperator *BOp = dyn_cast(CondVal)) { if (!DAG.getTargetLoweringInfo().isJumpExpensive() && BOp->hasOneUse() && (BOp->getOpcode() == Instruction::And || BOp->getOpcode() == Instruction::Or)) { FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB, BOp->getOpcode(), getEdgeWeight(BrMBB, Succ0MBB), getEdgeWeight(BrMBB, Succ1MBB)); // If the compares in later blocks need to use values not currently // exported from this block, export them now. This block should always // be the first entry. assert(SwitchCases[0].ThisBB == BrMBB && "Unexpected lowering!"); // Allow some cases to be rejected. if (ShouldEmitAsBranches(SwitchCases)) { for (unsigned i = 1, e = SwitchCases.size(); i != e; ++i) { ExportFromCurrentBlock(SwitchCases[i].CmpLHS); ExportFromCurrentBlock(SwitchCases[i].CmpRHS); } // Emit the branch for this block. visitSwitchCase(SwitchCases[0], BrMBB); SwitchCases.erase(SwitchCases.begin()); return; } // Okay, we decided not to do this, remove any inserted MBB's and clear // SwitchCases. for (unsigned i = 1, e = SwitchCases.size(); i != e; ++i) FuncInfo.MF->erase(SwitchCases[i].ThisBB); SwitchCases.clear(); } } // Create a CaseBlock record representing this branch. CaseBlock CB(ISD::SETEQ, CondVal, ConstantInt::getTrue(*DAG.getContext()), nullptr, Succ0MBB, Succ1MBB, BrMBB); // Use visitSwitchCase to actually insert the fast branch sequence for this // cond branch. visitSwitchCase(CB, BrMBB); } /// visitSwitchCase - Emits the necessary code to represent a single node in /// the binary search tree resulting from lowering a switch instruction. void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB, MachineBasicBlock *SwitchBB) { SDValue Cond; SDValue CondLHS = getValue(CB.CmpLHS); SDLoc dl = getCurSDLoc(); // Build the setcc now. if (!CB.CmpMHS) { // Fold "(X == true)" to X and "(X == false)" to !X to // handle common cases produced by branch lowering. if (CB.CmpRHS == ConstantInt::getTrue(*DAG.getContext()) && CB.CC == ISD::SETEQ) Cond = CondLHS; else if (CB.CmpRHS == ConstantInt::getFalse(*DAG.getContext()) && CB.CC == ISD::SETEQ) { SDValue True = DAG.getConstant(1, CondLHS.getValueType()); Cond = DAG.getNode(ISD::XOR, dl, CondLHS.getValueType(), CondLHS, True); } else Cond = DAG.getSetCC(dl, MVT::i1, CondLHS, getValue(CB.CmpRHS), CB.CC); } else { assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now"); const APInt& Low = cast(CB.CmpLHS)->getValue(); const APInt& High = cast(CB.CmpRHS)->getValue(); SDValue CmpOp = getValue(CB.CmpMHS); EVT VT = CmpOp.getValueType(); if (cast(CB.CmpLHS)->isMinValue(true)) { Cond = DAG.getSetCC(dl, MVT::i1, CmpOp, DAG.getConstant(High, VT), ISD::SETLE); } else { SDValue SUB = DAG.getNode(ISD::SUB, dl, VT, CmpOp, DAG.getConstant(Low, VT)); Cond = DAG.getSetCC(dl, MVT::i1, SUB, DAG.getConstant(High-Low, VT), ISD::SETULE); } } // Update successor info addSuccessorWithWeight(SwitchBB, CB.TrueBB, CB.TrueWeight); // TrueBB and FalseBB are always different unless the incoming IR is // degenerate. This only happens when running llc on weird IR. if (CB.TrueBB != CB.FalseBB) addSuccessorWithWeight(SwitchBB, CB.FalseBB, CB.FalseWeight); // Set NextBlock to be the MBB immediately after the current one, if any. // This is used to avoid emitting unnecessary branches to the next block. MachineBasicBlock *NextBlock = nullptr; MachineFunction::iterator BBI = SwitchBB; if (++BBI != FuncInfo.MF->end()) NextBlock = BBI; // If the lhs block is the next block, invert the condition so that we can // fall through to the lhs instead of the rhs block. if (CB.TrueBB == NextBlock) { std::swap(CB.TrueBB, CB.FalseBB); SDValue True = DAG.getConstant(1, Cond.getValueType()); Cond = DAG.getNode(ISD::XOR, dl, Cond.getValueType(), Cond, True); } SDValue BrCond = DAG.getNode(ISD::BRCOND, dl, MVT::Other, getControlRoot(), Cond, DAG.getBasicBlock(CB.TrueBB)); // Insert the false branch. Do this even if it's a fall through branch, // this makes it easier to do DAG optimizations which require inverting // the branch condition. BrCond = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond, DAG.getBasicBlock(CB.FalseBB)); DAG.setRoot(BrCond); } /// visitJumpTable - Emit JumpTable node in the current MBB void SelectionDAGBuilder::visitJumpTable(JumpTable &JT) { // Emit the code for the jump table assert(JT.Reg != -1U && "Should lower JT Header first!"); EVT PTy = DAG.getTargetLoweringInfo().getPointerTy(); SDValue Index = DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(), JT.Reg, PTy); SDValue Table = DAG.getJumpTable(JT.JTI, PTy); SDValue BrJumpTable = DAG.getNode(ISD::BR_JT, getCurSDLoc(), MVT::Other, Index.getValue(1), Table, Index); DAG.setRoot(BrJumpTable); } /// visitJumpTableHeader - This function emits necessary code to produce index /// in the JumpTable from switch case. void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT, JumpTableHeader &JTH, MachineBasicBlock *SwitchBB) { // Subtract the lowest switch case value from the value being switched on and // conditional branch to default mbb if the result is greater than the // difference between smallest and largest cases. SDValue SwitchOp = getValue(JTH.SValue); EVT VT = SwitchOp.getValueType(); SDValue Sub = DAG.getNode(ISD::SUB, getCurSDLoc(), VT, SwitchOp, DAG.getConstant(JTH.First, VT)); // The SDNode we just created, which holds the value being switched on minus // the smallest case value, needs to be copied to a virtual register so it // can be used as an index into the jump table in a subsequent basic block. // This value may be smaller or larger than the target's pointer type, and // therefore require extension or truncating. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SwitchOp = DAG.getZExtOrTrunc(Sub, getCurSDLoc(), TLI.getPointerTy()); unsigned JumpTableReg = FuncInfo.CreateReg(TLI.getPointerTy()); SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurSDLoc(), JumpTableReg, SwitchOp); JT.Reg = JumpTableReg; // Emit the range check for the jump table, and branch to the default block // for the switch statement if the value being switched on exceeds the largest // case in the switch. SDValue CMP = DAG.getSetCC(getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), Sub.getValueType()), Sub, DAG.getConstant(JTH.Last - JTH.First, VT), ISD::SETUGT); // Set NextBlock to be the MBB immediately after the current one, if any. // This is used to avoid emitting unnecessary branches to the next block. MachineBasicBlock *NextBlock = nullptr; MachineFunction::iterator BBI = SwitchBB; if (++BBI != FuncInfo.MF->end()) NextBlock = BBI; SDValue BrCond = DAG.getNode(ISD::BRCOND, getCurSDLoc(), MVT::Other, CopyTo, CMP, DAG.getBasicBlock(JT.Default)); if (JT.MBB != NextBlock) BrCond = DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, BrCond, DAG.getBasicBlock(JT.MBB)); DAG.setRoot(BrCond); } /// Codegen a new tail for a stack protector check ParentMBB which has had its /// tail spliced into a stack protector check success bb. /// /// For a high level explanation of how this fits into the stack protector /// generation see the comment on the declaration of class /// StackProtectorDescriptor. void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, MachineBasicBlock *ParentBB) { // First create the loads to the guard/stack slot for the comparison. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT PtrTy = TLI.getPointerTy(); MachineFrameInfo *MFI = ParentBB->getParent()->getFrameInfo(); int FI = MFI->getStackProtectorIndex(); const Value *IRGuard = SPD.getGuard(); SDValue GuardPtr = getValue(IRGuard); SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy); unsigned Align = TLI.getDataLayout()->getPrefTypeAlignment(IRGuard->getType()); SDValue Guard; // If GuardReg is set and useLoadStackGuardNode returns true, retrieve the // guard value from the virtual register holding the value. Otherwise, emit a // volatile load to retrieve the stack guard value. unsigned GuardReg = SPD.getGuardReg(); if (GuardReg && TLI.useLoadStackGuardNode()) Guard = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), GuardReg, PtrTy); else Guard = DAG.getLoad(PtrTy, getCurSDLoc(), DAG.getEntryNode(), GuardPtr, MachinePointerInfo(IRGuard, 0), true, false, false, Align); SDValue StackSlot = DAG.getLoad(PtrTy, getCurSDLoc(), DAG.getEntryNode(), StackSlotPtr, MachinePointerInfo::getFixedStack(FI), true, false, false, Align); // Perform the comparison via a subtract/getsetcc. EVT VT = Guard.getValueType(); SDValue Sub = DAG.getNode(ISD::SUB, getCurSDLoc(), VT, Guard, StackSlot); SDValue Cmp = DAG.getSetCC(getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), Sub.getValueType()), Sub, DAG.getConstant(0, VT), ISD::SETNE); // If the sub is not 0, then we know the guard/stackslot do not equal, so // branch to failure MBB. SDValue BrCond = DAG.getNode(ISD::BRCOND, getCurSDLoc(), MVT::Other, StackSlot.getOperand(0), Cmp, DAG.getBasicBlock(SPD.getFailureMBB())); // Otherwise branch to success MBB. SDValue Br = DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, BrCond, DAG.getBasicBlock(SPD.getSuccessMBB())); DAG.setRoot(Br); } /// Codegen the failure basic block for a stack protector check. /// /// A failure stack protector machine basic block consists simply of a call to /// __stack_chk_fail(). /// /// For a high level explanation of how this fits into the stack protector /// generation see the comment on the declaration of class /// StackProtectorDescriptor. void SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Chain = TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid, nullptr, 0, false, getCurSDLoc(), false, false).second; DAG.setRoot(Chain); } /// visitBitTestHeader - This function emits necessary code to produce value /// suitable for "bit tests" void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, MachineBasicBlock *SwitchBB) { // Subtract the minimum value SDValue SwitchOp = getValue(B.SValue); EVT VT = SwitchOp.getValueType(); SDValue Sub = DAG.getNode(ISD::SUB, getCurSDLoc(), VT, SwitchOp, DAG.getConstant(B.First, VT)); // Check range const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue RangeCmp = DAG.getSetCC(getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), Sub.getValueType()), Sub, DAG.getConstant(B.Range, VT), ISD::SETUGT); // Determine the type of the test operands. bool UsePtrType = false; if (!TLI.isTypeLegal(VT)) UsePtrType = true; else { for (unsigned i = 0, e = B.Cases.size(); i != e; ++i) if (!isUIntN(VT.getSizeInBits(), B.Cases[i].Mask)) { // Switch table case range are encoded into series of masks. // Just use pointer type, it's guaranteed to fit. UsePtrType = true; break; } } if (UsePtrType) { VT = TLI.getPointerTy(); Sub = DAG.getZExtOrTrunc(Sub, getCurSDLoc(), VT); } B.RegVT = VT.getSimpleVT(); B.Reg = FuncInfo.CreateReg(B.RegVT); SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurSDLoc(), B.Reg, Sub); // Set NextBlock to be the MBB immediately after the current one, if any. // This is used to avoid emitting unnecessary branches to the next block. MachineBasicBlock *NextBlock = nullptr; MachineFunction::iterator BBI = SwitchBB; if (++BBI != FuncInfo.MF->end()) NextBlock = BBI; MachineBasicBlock* MBB = B.Cases[0].ThisBB; addSuccessorWithWeight(SwitchBB, B.Default); addSuccessorWithWeight(SwitchBB, MBB); SDValue BrRange = DAG.getNode(ISD::BRCOND, getCurSDLoc(), MVT::Other, CopyTo, RangeCmp, DAG.getBasicBlock(B.Default)); if (MBB != NextBlock) BrRange = DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, CopyTo, DAG.getBasicBlock(MBB)); DAG.setRoot(BrRange); } /// visitBitTestCase - this function produces one "bit test" void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB, MachineBasicBlock* NextMBB, uint32_t BranchWeightToNext, unsigned Reg, BitTestCase &B, MachineBasicBlock *SwitchBB) { MVT VT = BB.RegVT; SDValue ShiftOp = DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(), Reg, VT); SDValue Cmp; unsigned PopCount = CountPopulation_64(B.Mask); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (PopCount == 1) { // Testing for a single bit; just compare the shift count with what it // would need to be to shift a 1 bit in that position. Cmp = DAG.getSetCC( getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), VT), ShiftOp, DAG.getConstant(countTrailingZeros(B.Mask), VT), ISD::SETEQ); } else if (PopCount == BB.Range) { // There is only one zero bit in the range, test for it directly. Cmp = DAG.getSetCC( getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), VT), ShiftOp, DAG.getConstant(CountTrailingOnes_64(B.Mask), VT), ISD::SETNE); } else { // Make desired shift SDValue SwitchVal = DAG.getNode(ISD::SHL, getCurSDLoc(), VT, DAG.getConstant(1, VT), ShiftOp); // Emit bit tests and jumps SDValue AndOp = DAG.getNode(ISD::AND, getCurSDLoc(), VT, SwitchVal, DAG.getConstant(B.Mask, VT)); Cmp = DAG.getSetCC(getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), VT), AndOp, DAG.getConstant(0, VT), ISD::SETNE); } // The branch weight from SwitchBB to B.TargetBB is B.ExtraWeight. addSuccessorWithWeight(SwitchBB, B.TargetBB, B.ExtraWeight); // The branch weight from SwitchBB to NextMBB is BranchWeightToNext. addSuccessorWithWeight(SwitchBB, NextMBB, BranchWeightToNext); SDValue BrAnd = DAG.getNode(ISD::BRCOND, getCurSDLoc(), MVT::Other, getControlRoot(), Cmp, DAG.getBasicBlock(B.TargetBB)); // Set NextBlock to be the MBB immediately after the current one, if any. // This is used to avoid emitting unnecessary branches to the next block. MachineBasicBlock *NextBlock = nullptr; MachineFunction::iterator BBI = SwitchBB; if (++BBI != FuncInfo.MF->end()) NextBlock = BBI; if (NextMBB != NextBlock) BrAnd = DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, BrAnd, DAG.getBasicBlock(NextMBB)); DAG.setRoot(BrAnd); } void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { MachineBasicBlock *InvokeMBB = FuncInfo.MBB; // Retrieve successors. MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)]; MachineBasicBlock *LandingPad = FuncInfo.MBBMap[I.getSuccessor(1)]; const Value *Callee(I.getCalledValue()); const Function *Fn = dyn_cast(Callee); if (isa(Callee)) visitInlineAsm(&I); else if (Fn && Fn->isIntrinsic()) { switch (Fn->getIntrinsicID()) { default: llvm_unreachable("Cannot invoke this intrinsic"); case Intrinsic::donothing: // Ignore invokes to @llvm.donothing: jump directly to the next BB. break; case Intrinsic::experimental_patchpoint_void: case Intrinsic::experimental_patchpoint_i64: visitPatchpoint(&I, LandingPad); break; } } else LowerCallTo(&I, getValue(Callee), false, LandingPad); // If the value of the invoke is used outside of its defining block, make it // available as a virtual register. CopyToExportRegsIfNeeded(&I); // Update successor info addSuccessorWithWeight(InvokeMBB, Return); addSuccessorWithWeight(InvokeMBB, LandingPad); // Drop into normal successor. DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, getControlRoot(), DAG.getBasicBlock(Return))); } void SelectionDAGBuilder::visitResume(const ResumeInst &RI) { llvm_unreachable("SelectionDAGBuilder shouldn't visit resume instructions!"); } void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) { assert(FuncInfo.MBB->isLandingPad() && "Call to landingpad not in landing pad!"); MachineBasicBlock *MBB = FuncInfo.MBB; MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); AddLandingPadInfo(LP, MMI, MBB); // If there aren't registers to copy the values into (e.g., during SjLj // exceptions), then don't bother to create these DAG nodes. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.getExceptionPointerRegister() == 0 && TLI.getExceptionSelectorRegister() == 0) return; SmallVector ValueVTs; ComputeValueVTs(TLI, LP.getType(), ValueVTs); assert(ValueVTs.size() == 2 && "Only two-valued landingpads are supported"); // Get the two live-in registers as SDValues. The physregs have already been // copied into virtual registers. SDValue Ops[2]; Ops[0] = DAG.getZExtOrTrunc( DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), FuncInfo.ExceptionPointerVirtReg, TLI.getPointerTy()), getCurSDLoc(), ValueVTs[0]); Ops[1] = DAG.getZExtOrTrunc( DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), FuncInfo.ExceptionSelectorVirtReg, TLI.getPointerTy()), getCurSDLoc(), ValueVTs[1]); // Merge into one. SDValue Res = DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), DAG.getVTList(ValueVTs), Ops); setValue(&LP, Res); } /// handleSmallSwitchCaseRange - Emit a series of specific tests (suitable for /// small case ranges). bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR, CaseRecVector& WorkList, const Value* SV, MachineBasicBlock *Default, MachineBasicBlock *SwitchBB) { // Size is the number of Cases represented by this range. size_t Size = CR.Range.second - CR.Range.first; if (Size > 3) return false; // Get the MachineFunction which holds the current MBB. This is used when // inserting any additional MBBs necessary to represent the switch. MachineFunction *CurMF = FuncInfo.MF; // Figure out which block is immediately after the current one. MachineBasicBlock *NextBlock = nullptr; MachineFunction::iterator BBI = CR.CaseBB; if (++BBI != FuncInfo.MF->end()) NextBlock = BBI; BranchProbabilityInfo *BPI = FuncInfo.BPI; // If any two of the cases has the same destination, and if one value // is the same as the other, but has one bit unset that the other has set, // use bit manipulation to do two compares at once. For example: // "if (X == 6 || X == 4)" -> "if ((X|2) == 6)" // TODO: This could be extended to merge any 2 cases in switches with 3 cases. // TODO: Handle cases where CR.CaseBB != SwitchBB. if (Size == 2 && CR.CaseBB == SwitchBB) { Case &Small = *CR.Range.first; Case &Big = *(CR.Range.second-1); if (Small.Low == Small.High && Big.Low == Big.High && Small.BB == Big.BB) { const APInt& SmallValue = cast(Small.Low)->getValue(); const APInt& BigValue = cast(Big.Low)->getValue(); // Check that there is only one bit different. if (BigValue.countPopulation() == SmallValue.countPopulation() + 1 && (SmallValue | BigValue) == BigValue) { // Isolate the common bit. APInt CommonBit = BigValue & ~SmallValue; assert((SmallValue | CommonBit) == BigValue && CommonBit.countPopulation() == 1 && "Not a common bit?"); SDValue CondLHS = getValue(SV); EVT VT = CondLHS.getValueType(); SDLoc DL = getCurSDLoc(); SDValue Or = DAG.getNode(ISD::OR, DL, VT, CondLHS, DAG.getConstant(CommonBit, VT)); SDValue Cond = DAG.getSetCC(DL, MVT::i1, Or, DAG.getConstant(BigValue, VT), ISD::SETEQ); // Update successor info. // Both Small and Big will jump to Small.BB, so we sum up the weights. addSuccessorWithWeight(SwitchBB, Small.BB, Small.ExtraWeight + Big.ExtraWeight); addSuccessorWithWeight(SwitchBB, Default, // The default destination is the first successor in IR. BPI ? BPI->getEdgeWeight(SwitchBB->getBasicBlock(), (unsigned)0) : 0); // Insert the true branch. SDValue BrCond = DAG.getNode(ISD::BRCOND, DL, MVT::Other, getControlRoot(), Cond, DAG.getBasicBlock(Small.BB)); // Insert the false branch. BrCond = DAG.getNode(ISD::BR, DL, MVT::Other, BrCond, DAG.getBasicBlock(Default)); DAG.setRoot(BrCond); return true; } } } // Order cases by weight so the most likely case will be checked first. uint32_t UnhandledWeights = 0; if (BPI) { for (CaseItr I = CR.Range.first, IE = CR.Range.second; I != IE; ++I) { uint32_t IWeight = I->ExtraWeight; UnhandledWeights += IWeight; for (CaseItr J = CR.Range.first; J < I; ++J) { uint32_t JWeight = J->ExtraWeight; if (IWeight > JWeight) std::swap(*I, *J); } } } // Rearrange the case blocks so that the last one falls through if possible. Case &BackCase = *(CR.Range.second-1); if (Size > 1 && NextBlock && Default != NextBlock && BackCase.BB != NextBlock) { // The last case block won't fall through into 'NextBlock' if we emit the // branches in this order. See if rearranging a case value would help. // We start at the bottom as it's the case with the least weight. for (Case *I = &*(CR.Range.second-2), *E = &*CR.Range.first-1; I != E; --I) if (I->BB == NextBlock) { std::swap(*I, BackCase); break; } } // Create a CaseBlock record representing a conditional branch to // the Case's target mbb if the value being switched on SV is equal // to C. MachineBasicBlock *CurBlock = CR.CaseBB; for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) { MachineBasicBlock *FallThrough; if (I != E-1) { FallThrough = CurMF->CreateMachineBasicBlock(CurBlock->getBasicBlock()); CurMF->insert(BBI, FallThrough); // Put SV in a virtual register to make it available from the new blocks. ExportFromCurrentBlock(SV); } else { // If the last case doesn't match, go to the default block. FallThrough = Default; } const Value *RHS, *LHS, *MHS; ISD::CondCode CC; if (I->High == I->Low) { // This is just small small case range :) containing exactly 1 case CC = ISD::SETEQ; LHS = SV; RHS = I->High; MHS = nullptr; } else { CC = ISD::SETLE; LHS = I->Low; MHS = SV; RHS = I->High; } // The false weight should be sum of all un-handled cases. UnhandledWeights -= I->ExtraWeight; CaseBlock CB(CC, LHS, RHS, MHS, /* truebb */ I->BB, /* falsebb */ FallThrough, /* me */ CurBlock, /* trueweight */ I->ExtraWeight, /* falseweight */ UnhandledWeights); // If emitting the first comparison, just call visitSwitchCase to emit the // code into the current block. Otherwise, push the CaseBlock onto the // vector to be later processed by SDISel, and insert the node's MBB // before the next MBB. if (CurBlock == SwitchBB) visitSwitchCase(CB, SwitchBB); else SwitchCases.push_back(CB); CurBlock = FallThrough; } return true; } static inline bool areJTsAllowed(const TargetLowering &TLI) { return TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other); } static APInt ComputeRange(const APInt &First, const APInt &Last) { uint32_t BitWidth = std::max(Last.getBitWidth(), First.getBitWidth()) + 1; APInt LastExt = Last.sext(BitWidth), FirstExt = First.sext(BitWidth); return (LastExt - FirstExt + 1ULL); } /// handleJTSwitchCase - Emit jumptable for current switch case range bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR, CaseRecVector &WorkList, const Value *SV, MachineBasicBlock *Default, MachineBasicBlock *SwitchBB) { Case& FrontCase = *CR.Range.first; Case& BackCase = *(CR.Range.second-1); const APInt &First = cast(FrontCase.Low)->getValue(); const APInt &Last = cast(BackCase.High)->getValue(); APInt TSize(First.getBitWidth(), 0); for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) TSize += I->size(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!areJTsAllowed(TLI) || TSize.ult(TLI.getMinimumJumpTableEntries())) return false; APInt Range = ComputeRange(First, Last); // The density is TSize / Range. Require at least 40%. // It should not be possible for IntTSize to saturate for sane code, but make // sure we handle Range saturation correctly. uint64_t IntRange = Range.getLimitedValue(UINT64_MAX/10); uint64_t IntTSize = TSize.getLimitedValue(UINT64_MAX/10); if (IntTSize * 10 < IntRange * 4) return false; DEBUG(dbgs() << "Lowering jump table\n" << "First entry: " << First << ". Last entry: " << Last << '\n' << "Range: " << Range << ". Size: " << TSize << ".\n\n"); // Get the MachineFunction which holds the current MBB. This is used when // inserting any additional MBBs necessary to represent the switch. MachineFunction *CurMF = FuncInfo.MF; // Figure out which block is immediately after the current one. MachineFunction::iterator BBI = CR.CaseBB; ++BBI; const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock(); // Create a new basic block to hold the code for loading the address // of the jump table, and jumping to it. Update successor information; // we will either branch to the default case for the switch, or the jump // table. MachineBasicBlock *JumpTableBB = CurMF->CreateMachineBasicBlock(LLVMBB); CurMF->insert(BBI, JumpTableBB); addSuccessorWithWeight(CR.CaseBB, Default); addSuccessorWithWeight(CR.CaseBB, JumpTableBB); // Build a vector of destination BBs, corresponding to each target // of the jump table. If the value of the jump table slot corresponds to // a case statement, push the case's BB onto the vector, otherwise, push // the default BB. std::vector DestBBs; APInt TEI = First; for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++TEI) { const APInt &Low = cast(I->Low)->getValue(); const APInt &High = cast(I->High)->getValue(); if (Low.sle(TEI) && TEI.sle(High)) { DestBBs.push_back(I->BB); if (TEI==High) ++I; } else { DestBBs.push_back(Default); } } // Calculate weight for each unique destination in CR. DenseMap DestWeights; if (FuncInfo.BPI) for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) { DenseMap::iterator Itr = DestWeights.find(I->BB); if (Itr != DestWeights.end()) Itr->second += I->ExtraWeight; else DestWeights[I->BB] = I->ExtraWeight; } // Update successor info. Add one edge to each unique successor. BitVector SuccsHandled(CR.CaseBB->getParent()->getNumBlockIDs()); for (std::vector::iterator I = DestBBs.begin(), E = DestBBs.end(); I != E; ++I) { if (!SuccsHandled[(*I)->getNumber()]) { SuccsHandled[(*I)->getNumber()] = true; DenseMap::iterator Itr = DestWeights.find(*I); addSuccessorWithWeight(JumpTableBB, *I, Itr != DestWeights.end() ? Itr->second : 0); } } // Create a jump table index for this jump table. unsigned JTEncoding = TLI.getJumpTableEncoding(); unsigned JTI = CurMF->getOrCreateJumpTableInfo(JTEncoding) ->createJumpTableIndex(DestBBs); // Set the jump table information so that we can codegen it as a second // MachineBasicBlock JumpTable JT(-1U, JTI, JumpTableBB, Default); JumpTableHeader JTH(First, Last, SV, CR.CaseBB, (CR.CaseBB == SwitchBB)); if (CR.CaseBB == SwitchBB) visitJumpTableHeader(JT, JTH, SwitchBB); JTCases.push_back(JumpTableBlock(JTH, JT)); return true; } /// handleBTSplitSwitchCase - emit comparison and split binary search tree into /// 2 subtrees. bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR, CaseRecVector& WorkList, const Value* SV, MachineBasicBlock* SwitchBB) { // Get the MachineFunction which holds the current MBB. This is used when // inserting any additional MBBs necessary to represent the switch. MachineFunction *CurMF = FuncInfo.MF; // Figure out which block is immediately after the current one. MachineFunction::iterator BBI = CR.CaseBB; ++BBI; Case& FrontCase = *CR.Range.first; Case& BackCase = *(CR.Range.second-1); const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock(); // Size is the number of Cases represented by this range. unsigned Size = CR.Range.second - CR.Range.first; const APInt &First = cast(FrontCase.Low)->getValue(); const APInt &Last = cast(BackCase.High)->getValue(); double FMetric = 0; CaseItr Pivot = CR.Range.first + Size/2; // Select optimal pivot, maximizing sum density of LHS and RHS. This will // (heuristically) allow us to emit JumpTable's later. APInt TSize(First.getBitWidth(), 0); for (CaseItr I = CR.Range.first, E = CR.Range.second; I!=E; ++I) TSize += I->size(); APInt LSize = FrontCase.size(); APInt RSize = TSize-LSize; DEBUG(dbgs() << "Selecting best pivot: \n" << "First: " << First << ", Last: " << Last <<'\n' << "LSize: " << LSize << ", RSize: " << RSize << '\n'); for (CaseItr I = CR.Range.first, J=I+1, E = CR.Range.second; J!=E; ++I, ++J) { const APInt &LEnd = cast(I->High)->getValue(); const APInt &RBegin = cast(J->Low)->getValue(); APInt Range = ComputeRange(LEnd, RBegin); assert((Range - 2ULL).isNonNegative() && "Invalid case distance"); // Use volatile double here to avoid excess precision issues on some hosts, // e.g. that use 80-bit X87 registers. volatile double LDensity = (double)LSize.roundToDouble() / (LEnd - First + 1ULL).roundToDouble(); volatile double RDensity = (double)RSize.roundToDouble() / (Last - RBegin + 1ULL).roundToDouble(); volatile double Metric = Range.logBase2()*(LDensity+RDensity); // Should always split in some non-trivial place DEBUG(dbgs() <<"=>Step\n" << "LEnd: " << LEnd << ", RBegin: " << RBegin << '\n' << "LDensity: " << LDensity << ", RDensity: " << RDensity << '\n' << "Metric: " << Metric << '\n'); if (FMetric < Metric) { Pivot = J; FMetric = Metric; DEBUG(dbgs() << "Current metric set to: " << FMetric << '\n'); } LSize += J->size(); RSize -= J->size(); } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (areJTsAllowed(TLI)) { // If our case is dense we *really* should handle it earlier! assert((FMetric > 0) && "Should handle dense range earlier!"); } else { Pivot = CR.Range.first + Size/2; } CaseRange LHSR(CR.Range.first, Pivot); CaseRange RHSR(Pivot, CR.Range.second); const Constant *C = Pivot->Low; MachineBasicBlock *FalseBB = nullptr, *TrueBB = nullptr; // We know that we branch to the LHS if the Value being switched on is // less than the Pivot value, C. We use this to optimize our binary // tree a bit, by recognizing that if SV is greater than or equal to the // LHS's Case Value, and that Case Value is exactly one less than the // Pivot's Value, then we can branch directly to the LHS's Target, // rather than creating a leaf node for it. if ((LHSR.second - LHSR.first) == 1 && LHSR.first->High == CR.GE && cast(C)->getValue() == (cast(CR.GE)->getValue() + 1LL)) { TrueBB = LHSR.first->BB; } else { TrueBB = CurMF->CreateMachineBasicBlock(LLVMBB); CurMF->insert(BBI, TrueBB); WorkList.push_back(CaseRec(TrueBB, C, CR.GE, LHSR)); // Put SV in a virtual register to make it available from the new blocks. ExportFromCurrentBlock(SV); } // Similar to the optimization above, if the Value being switched on is // known to be less than the Constant CR.LT, and the current Case Value // is CR.LT - 1, then we can branch directly to the target block for // the current Case Value, rather than emitting a RHS leaf node for it. if ((RHSR.second - RHSR.first) == 1 && CR.LT && cast(RHSR.first->Low)->getValue() == (cast(CR.LT)->getValue() - 1LL)) { FalseBB = RHSR.first->BB; } else { FalseBB = CurMF->CreateMachineBasicBlock(LLVMBB); CurMF->insert(BBI, FalseBB); WorkList.push_back(CaseRec(FalseBB,CR.LT,C,RHSR)); // Put SV in a virtual register to make it available from the new blocks. ExportFromCurrentBlock(SV); } // Create a CaseBlock record representing a conditional branch to // the LHS node if the value being switched on SV is less than C. // Otherwise, branch to LHS. CaseBlock CB(ISD::SETLT, SV, C, nullptr, TrueBB, FalseBB, CR.CaseBB); if (CR.CaseBB == SwitchBB) visitSwitchCase(CB, SwitchBB); else SwitchCases.push_back(CB); return true; } /// handleBitTestsSwitchCase - if current case range has few destination and /// range span less, than machine word bitwidth, encode case range into series /// of masks and emit bit tests with these masks. bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR, CaseRecVector& WorkList, const Value* SV, MachineBasicBlock* Default, MachineBasicBlock* SwitchBB) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT PTy = TLI.getPointerTy(); unsigned IntPtrBits = PTy.getSizeInBits(); Case& FrontCase = *CR.Range.first; Case& BackCase = *(CR.Range.second-1); // Get the MachineFunction which holds the current MBB. This is used when // inserting any additional MBBs necessary to represent the switch. MachineFunction *CurMF = FuncInfo.MF; // If target does not have legal shift left, do not emit bit tests at all. if (!TLI.isOperationLegal(ISD::SHL, PTy)) return false; size_t numCmps = 0; for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) { // Single case counts one, case range - two. numCmps += (I->Low == I->High ? 1 : 2); } // Count unique destinations SmallSet Dests; for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) { Dests.insert(I->BB); if (Dests.size() > 3) // Don't bother the code below, if there are too much unique destinations return false; } DEBUG(dbgs() << "Total number of unique destinations: " << Dests.size() << '\n' << "Total number of comparisons: " << numCmps << '\n'); // Compute span of values. const APInt& minValue = cast(FrontCase.Low)->getValue(); const APInt& maxValue = cast(BackCase.High)->getValue(); APInt cmpRange = maxValue - minValue; DEBUG(dbgs() << "Compare range: " << cmpRange << '\n' << "Low bound: " << minValue << '\n' << "High bound: " << maxValue << '\n'); if (cmpRange.uge(IntPtrBits) || (!(Dests.size() == 1 && numCmps >= 3) && !(Dests.size() == 2 && numCmps >= 5) && !(Dests.size() >= 3 && numCmps >= 6))) return false; DEBUG(dbgs() << "Emitting bit tests\n"); APInt lowBound = APInt::getNullValue(cmpRange.getBitWidth()); // Optimize the case where all the case values fit in a // word without having to subtract minValue. In this case, // we can optimize away the subtraction. if (minValue.isNonNegative() && maxValue.slt(IntPtrBits)) { cmpRange = maxValue; } else { lowBound = minValue; } CaseBitsVector CasesBits; unsigned i, count = 0; for (CaseItr I = CR.Range.first, E = CR.Range.second; I!=E; ++I) { MachineBasicBlock* Dest = I->BB; for (i = 0; i < count; ++i) if (Dest == CasesBits[i].BB) break; if (i == count) { assert((count < 3) && "Too much destinations to test!"); CasesBits.push_back(CaseBits(0, Dest, 0, 0/*Weight*/)); count++; } const APInt& lowValue = cast(I->Low)->getValue(); const APInt& highValue = cast(I->High)->getValue(); uint64_t lo = (lowValue - lowBound).getZExtValue(); uint64_t hi = (highValue - lowBound).getZExtValue(); CasesBits[i].ExtraWeight += I->ExtraWeight; for (uint64_t j = lo; j <= hi; j++) { CasesBits[i].Mask |= 1ULL << j; CasesBits[i].Bits++; } } std::sort(CasesBits.begin(), CasesBits.end(), CaseBitsCmp()); BitTestInfo BTC; // Figure out which block is immediately after the current one. MachineFunction::iterator BBI = CR.CaseBB; ++BBI; const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock(); DEBUG(dbgs() << "Cases:\n"); for (unsigned i = 0, e = CasesBits.size(); i!=e; ++i) { DEBUG(dbgs() << "Mask: " << CasesBits[i].Mask << ", Bits: " << CasesBits[i].Bits << ", BB: " << CasesBits[i].BB << '\n'); MachineBasicBlock *CaseBB = CurMF->CreateMachineBasicBlock(LLVMBB); CurMF->insert(BBI, CaseBB); BTC.push_back(BitTestCase(CasesBits[i].Mask, CaseBB, CasesBits[i].BB, CasesBits[i].ExtraWeight)); // Put SV in a virtual register to make it available from the new blocks. ExportFromCurrentBlock(SV); } BitTestBlock BTB(lowBound, cmpRange, SV, -1U, MVT::Other, (CR.CaseBB == SwitchBB), CR.CaseBB, Default, std::move(BTC)); if (CR.CaseBB == SwitchBB) visitBitTestHeader(BTB, SwitchBB); BitTestCases.push_back(std::move(BTB)); return true; } /// Clusterify - Transform simple list of Cases into list of CaseRange's void SelectionDAGBuilder::Clusterify(CaseVector& Cases, const SwitchInst& SI) { BranchProbabilityInfo *BPI = FuncInfo.BPI; // Start with "simple" cases. for (SwitchInst::ConstCaseIt i : SI.cases()) { const BasicBlock *SuccBB = i.getCaseSuccessor(); MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB]; uint32_t ExtraWeight = BPI ? BPI->getEdgeWeight(SI.getParent(), i.getSuccessorIndex()) : 0; Cases.push_back(Case(i.getCaseValue(), i.getCaseValue(), SMBB, ExtraWeight)); } std::sort(Cases.begin(), Cases.end(), CaseCmp()); // Merge case into clusters if (Cases.size() >= 2) // Must recompute end() each iteration because it may be // invalidated by erase if we hold on to it for (CaseItr I = Cases.begin(), J = std::next(Cases.begin()); J != Cases.end(); ) { const APInt& nextValue = cast(J->Low)->getValue(); const APInt& currentValue = cast(I->High)->getValue(); MachineBasicBlock* nextBB = J->BB; MachineBasicBlock* currentBB = I->BB; // If the two neighboring cases go to the same destination, merge them // into a single case. if ((nextValue - currentValue == 1) && (currentBB == nextBB)) { I->High = J->High; I->ExtraWeight += J->ExtraWeight; J = Cases.erase(J); } else { I = J++; } } DEBUG({ size_t numCmps = 0; for (auto &I : Cases) // A range counts double, since it requires two compares. numCmps += I.Low != I.High ? 2 : 1; dbgs() << "Clusterify finished. Total clusters: " << Cases.size() << ". Total compares: " << numCmps << '\n'; }); } void SelectionDAGBuilder::UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last) { // Update JTCases. for (unsigned i = 0, e = JTCases.size(); i != e; ++i) if (JTCases[i].first.HeaderBB == First) JTCases[i].first.HeaderBB = Last; // Update BitTestCases. for (unsigned i = 0, e = BitTestCases.size(); i != e; ++i) if (BitTestCases[i].Parent == First) BitTestCases[i].Parent = Last; } void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { MachineBasicBlock *SwitchMBB = FuncInfo.MBB; // Figure out which block is immediately after the current one. MachineBasicBlock *NextBlock = nullptr; if (SwitchMBB + 1 != FuncInfo.MF->end()) NextBlock = SwitchMBB + 1; // Create a vector of Cases, sorted so that we can efficiently create a binary // search tree from them. CaseVector Cases; Clusterify(Cases, SI); // Get the default destination MBB. MachineBasicBlock *Default = FuncInfo.MBBMap[SI.getDefaultDest()]; if (isa(SI.getDefaultDest()->getFirstNonPHIOrDbg()) && !Cases.empty()) { // Replace an unreachable default destination with the most popular case // destination. DenseMap Popularity; unsigned MaxPop = 0; const BasicBlock *MaxBB = nullptr; for (auto I : SI.cases()) { const BasicBlock *BB = I.getCaseSuccessor(); if (++Popularity[BB] > MaxPop) { MaxPop = Popularity[BB]; MaxBB = BB; } } // Set new default. assert(MaxPop > 0); assert(MaxBB); Default = FuncInfo.MBBMap[MaxBB]; // Remove cases that were pointing to the destination that is now the default. Cases.erase(std::remove_if(Cases.begin(), Cases.end(), [&](const Case &C) { return C.BB == Default; }), Cases.end()); } // If there is only the default destination, go there directly. if (Cases.empty()) { // Update machine-CFG edges. SwitchMBB->addSuccessor(Default); // If this is not a fall-through branch, emit the branch. if (Default != NextBlock) { DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, getControlRoot(), DAG.getBasicBlock(Default))); } return; } // Get the Value to be switched on. const Value *SV = SI.getCondition(); // Push the initial CaseRec onto the worklist CaseRecVector WorkList; WorkList.push_back(CaseRec(SwitchMBB,nullptr,nullptr, CaseRange(Cases.begin(),Cases.end()))); while (!WorkList.empty()) { // Grab a record representing a case range to process off the worklist CaseRec CR = WorkList.back(); WorkList.pop_back(); if (handleBitTestsSwitchCase(CR, WorkList, SV, Default, SwitchMBB)) continue; // If the range has few cases (two or less) emit a series of specific // tests. if (handleSmallSwitchRange(CR, WorkList, SV, Default, SwitchMBB)) continue; // If the switch has more than N blocks, and is at least 40% dense, and the // target supports indirect branches, then emit a jump table rather than // lowering the switch to a binary tree of conditional branches. // N defaults to 4 and is controlled via TLS.getMinimumJumpTableEntries(). if (handleJTSwitchCase(CR, WorkList, SV, Default, SwitchMBB)) continue; // Emit binary tree. We need to pick a pivot, and push left and right ranges // onto the worklist. Leafs are handled via handleSmallSwitchRange() call. handleBTSplitSwitchCase(CR, WorkList, SV, SwitchMBB); } } void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB; // Update machine-CFG edges with unique successors. SmallSet Done; for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) { BasicBlock *BB = I.getSuccessor(i); bool Inserted = Done.insert(BB).second; if (!Inserted) continue; MachineBasicBlock *Succ = FuncInfo.MBBMap[BB]; addSuccessorWithWeight(IndirectBrMBB, Succ); } DAG.setRoot(DAG.getNode(ISD::BRIND, getCurSDLoc(), MVT::Other, getControlRoot(), getValue(I.getAddress()))); } void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) { if (DAG.getTarget().Options.TrapUnreachable) DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot())); } void SelectionDAGBuilder::visitFSub(const User &I) { // -0.0 - X --> fneg Type *Ty = I.getType(); if (isa(I.getOperand(0)) && I.getOperand(0) == ConstantFP::getZeroValueForNegation(Ty)) { SDValue Op2 = getValue(I.getOperand(1)); setValue(&I, DAG.getNode(ISD::FNEG, getCurSDLoc(), Op2.getValueType(), Op2)); return; } visitBinary(I, ISD::FSUB); } void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) { SDValue Op1 = getValue(I.getOperand(0)); SDValue Op2 = getValue(I.getOperand(1)); bool nuw = false; bool nsw = false; bool exact = false; if (const OverflowingBinaryOperator *OFBinOp = dyn_cast(&I)) { nuw = OFBinOp->hasNoUnsignedWrap(); nsw = OFBinOp->hasNoSignedWrap(); } if (const PossiblyExactOperator *ExactOp = dyn_cast(&I)) exact = ExactOp->isExact(); SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(), Op1, Op2, nuw, nsw, exact); setValue(&I, BinNodeValue); } void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) { SDValue Op1 = getValue(I.getOperand(0)); SDValue Op2 = getValue(I.getOperand(1)); EVT ShiftTy = DAG.getTargetLoweringInfo().getShiftAmountTy(Op2.getValueType()); // Coerce the shift amount to the right type if we can. if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) { unsigned ShiftSize = ShiftTy.getSizeInBits(); unsigned Op2Size = Op2.getValueType().getSizeInBits(); SDLoc DL = getCurSDLoc(); // If the operand is smaller than the shift count type, promote it. if (ShiftSize > Op2Size) Op2 = DAG.getNode(ISD::ZERO_EXTEND, DL, ShiftTy, Op2); // If the operand is larger than the shift count type but the shift // count type has enough bits to represent any shift value, truncate // it now. This is a common case and it exposes the truncate to // optimization early. else if (ShiftSize >= Log2_32_Ceil(Op2.getValueType().getSizeInBits())) Op2 = DAG.getNode(ISD::TRUNCATE, DL, ShiftTy, Op2); // Otherwise we'll need to temporarily settle for some other convenient // type. Type legalization will make adjustments once the shiftee is split. else Op2 = DAG.getZExtOrTrunc(Op2, DL, MVT::i32); } bool nuw = false; bool nsw = false; bool exact = false; if (Opcode == ISD::SRL || Opcode == ISD::SRA || Opcode == ISD::SHL) { if (const OverflowingBinaryOperator *OFBinOp = dyn_cast(&I)) { nuw = OFBinOp->hasNoUnsignedWrap(); nsw = OFBinOp->hasNoSignedWrap(); } if (const PossiblyExactOperator *ExactOp = dyn_cast(&I)) exact = ExactOp->isExact(); } SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2, nuw, nsw, exact); setValue(&I, Res); } void SelectionDAGBuilder::visitSDiv(const User &I) { SDValue Op1 = getValue(I.getOperand(0)); SDValue Op2 = getValue(I.getOperand(1)); // Turn exact SDivs into multiplications. // FIXME: This should be in DAGCombiner, but it doesn't have access to the // exact bit. if (isa(&I) && cast(&I)->isExact() && !isa(Op1) && isa(Op2) && !cast(Op2)->isNullValue()) setValue(&I, DAG.getTargetLoweringInfo() .BuildExactSDIV(Op1, Op2, getCurSDLoc(), DAG)); else setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(), Op1, Op2)); } void SelectionDAGBuilder::visitICmp(const User &I) { ICmpInst::Predicate predicate = ICmpInst::BAD_ICMP_PREDICATE; if (const ICmpInst *IC = dyn_cast(&I)) predicate = IC->getPredicate(); else if (const ConstantExpr *IC = dyn_cast(&I)) predicate = ICmpInst::Predicate(IC->getPredicate()); SDValue Op1 = getValue(I.getOperand(0)); SDValue Op2 = getValue(I.getOperand(1)); ISD::CondCode Opcode = getICmpCondCode(predicate); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Opcode)); } void SelectionDAGBuilder::visitFCmp(const User &I) { FCmpInst::Predicate predicate = FCmpInst::BAD_FCMP_PREDICATE; if (const FCmpInst *FC = dyn_cast(&I)) predicate = FC->getPredicate(); else if (const ConstantExpr *FC = dyn_cast(&I)) predicate = FCmpInst::Predicate(FC->getPredicate()); SDValue Op1 = getValue(I.getOperand(0)); SDValue Op2 = getValue(I.getOperand(1)); ISD::CondCode Condition = getFCmpCondCode(predicate); if (TM.Options.NoNaNsFPMath) Condition = getFCmpCodeWithoutNaN(Condition); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition)); } void SelectionDAGBuilder::visitSelect(const User &I) { SmallVector ValueVTs; ComputeValueVTs(DAG.getTargetLoweringInfo(), I.getType(), ValueVTs); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) return; SmallVector Values(NumValues); SDValue Cond = getValue(I.getOperand(0)); SDValue TrueVal = getValue(I.getOperand(1)); SDValue FalseVal = getValue(I.getOperand(2)); ISD::NodeType OpCode = Cond.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT; for (unsigned i = 0; i != NumValues; ++i) Values[i] = DAG.getNode(OpCode, getCurSDLoc(), TrueVal.getNode()->getValueType(TrueVal.getResNo()+i), Cond, SDValue(TrueVal.getNode(), TrueVal.getResNo() + i), SDValue(FalseVal.getNode(), FalseVal.getResNo() + i)); setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), DAG.getVTList(ValueVTs), Values)); } void SelectionDAGBuilder::visitTrunc(const User &I) { // TruncInst cannot be a no-op cast because sizeof(src) > sizeof(dest). SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitZExt(const User &I) { // ZExt cannot be a no-op cast because sizeof(src) < sizeof(dest). // ZExt also can't be a cast to bool for same reason. So, nothing much to do SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitSExt(const User &I) { // SExt cannot be a no-op cast because sizeof(src) < sizeof(dest). // SExt also can't be a cast to bool for same reason. So, nothing much to do SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitFPTrunc(const User &I) { // FPTrunc is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT DestVT = TLI.getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::FP_ROUND, getCurSDLoc(), DestVT, N, DAG.getTargetConstant(0, TLI.getPointerTy()))); } void SelectionDAGBuilder::visitFPExt(const User &I) { // FPExt is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitFPToUI(const User &I) { // FPToUI is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitFPToSI(const User &I) { // FPToSI is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitUIToFP(const User &I) { // UIToFP is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitSIToFP(const User &I) { // SIToFP is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitPtrToInt(const User &I) { // What to do depends on the size of the integer and the size of the pointer. // We can either truncate, zero extend, or no-op, accordingly. SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT)); } void SelectionDAGBuilder::visitIntToPtr(const User &I) { // What to do depends on the size of the integer and the size of the pointer. // We can either truncate, zero extend, or no-op, accordingly. SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT)); } void SelectionDAGBuilder::visitBitCast(const User &I) { SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); // BitCast assures us that source and destination are the same size so this is // either a BITCAST or a no-op. if (DestVT != N.getValueType()) setValue(&I, DAG.getNode(ISD::BITCAST, getCurSDLoc(), DestVT, N)); // convert types. // Check if the original LLVM IR Operand was a ConstantInt, because getValue() // might fold any kind of constant expression to an integer constant and that // is not what we are looking for. Only regcognize a bitcast of a genuine // constant integer as an opaque constant. else if(ConstantInt *C = dyn_cast(I.getOperand(0))) setValue(&I, DAG.getConstant(C->getValue(), DestVT, /*isTarget=*/false, /*isOpaque*/true)); else setValue(&I, N); // noop cast. } void SelectionDAGBuilder::visitAddrSpaceCast(const User &I) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const Value *SV = I.getOperand(0); SDValue N = getValue(SV); EVT DestVT = TLI.getValueType(I.getType()); unsigned SrcAS = SV->getType()->getPointerAddressSpace(); unsigned DestAS = I.getType()->getPointerAddressSpace(); if (!TLI.isNoopAddrSpaceCast(SrcAS, DestAS)) N = DAG.getAddrSpaceCast(getCurSDLoc(), DestVT, N, SrcAS, DestAS); setValue(&I, N); } void SelectionDAGBuilder::visitInsertElement(const User &I) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue InVec = getValue(I.getOperand(0)); SDValue InVal = getValue(I.getOperand(1)); SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(2)), getCurSDLoc(), TLI.getVectorIdxTy()); setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurSDLoc(), TLI.getValueType(I.getType()), InVec, InVal, InIdx)); } void SelectionDAGBuilder::visitExtractElement(const User &I) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue InVec = getValue(I.getOperand(0)); SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(1)), getCurSDLoc(), TLI.getVectorIdxTy()); setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurSDLoc(), TLI.getValueType(I.getType()), InVec, InIdx)); } // Utility for visitShuffleVector - Return true if every element in Mask, // beginning from position Pos and ending in Pos+Size, falls within the // specified sequential range [L, L+Pos). or is undef. static bool isSequentialInRange(const SmallVectorImpl &Mask, unsigned Pos, unsigned Size, int Low) { for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) if (Mask[i] >= 0 && Mask[i] != Low) return false; return true; } void SelectionDAGBuilder::visitShuffleVector(const User &I) { SDValue Src1 = getValue(I.getOperand(0)); SDValue Src2 = getValue(I.getOperand(1)); SmallVector Mask; ShuffleVectorInst::getShuffleMask(cast(I.getOperand(2)), Mask); unsigned MaskNumElts = Mask.size(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = TLI.getValueType(I.getType()); EVT SrcVT = Src1.getValueType(); unsigned SrcNumElts = SrcVT.getVectorNumElements(); if (SrcNumElts == MaskNumElts) { setValue(&I, DAG.getVectorShuffle(VT, getCurSDLoc(), Src1, Src2, &Mask[0])); return; } // Normalize the shuffle vector since mask and vector length don't match. if (SrcNumElts < MaskNumElts && MaskNumElts % SrcNumElts == 0) { // Mask is longer than the source vectors and is a multiple of the source // vectors. We can use concatenate vector to make the mask and vectors // lengths match. if (SrcNumElts*2 == MaskNumElts) { // First check for Src1 in low and Src2 in high if (isSequentialInRange(Mask, 0, SrcNumElts, 0) && isSequentialInRange(Mask, SrcNumElts, SrcNumElts, SrcNumElts)) { // The shuffle is concatenating two vectors together. setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, getCurSDLoc(), VT, Src1, Src2)); return; } // Then check for Src2 in low and Src1 in high if (isSequentialInRange(Mask, 0, SrcNumElts, SrcNumElts) && isSequentialInRange(Mask, SrcNumElts, SrcNumElts, 0)) { // The shuffle is concatenating two vectors together. setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, getCurSDLoc(), VT, Src2, Src1)); return; } } // Pad both vectors with undefs to make them the same length as the mask. unsigned NumConcat = MaskNumElts / SrcNumElts; bool Src1U = Src1.getOpcode() == ISD::UNDEF; bool Src2U = Src2.getOpcode() == ISD::UNDEF; SDValue UndefVal = DAG.getUNDEF(SrcVT); SmallVector MOps1(NumConcat, UndefVal); SmallVector MOps2(NumConcat, UndefVal); MOps1[0] = Src1; MOps2[0] = Src2; Src1 = Src1U ? DAG.getUNDEF(VT) : DAG.getNode(ISD::CONCAT_VECTORS, getCurSDLoc(), VT, MOps1); Src2 = Src2U ? DAG.getUNDEF(VT) : DAG.getNode(ISD::CONCAT_VECTORS, getCurSDLoc(), VT, MOps2); // Readjust mask for new input vector length. SmallVector MappedOps; for (unsigned i = 0; i != MaskNumElts; ++i) { int Idx = Mask[i]; if (Idx >= (int)SrcNumElts) Idx -= SrcNumElts - MaskNumElts; MappedOps.push_back(Idx); } setValue(&I, DAG.getVectorShuffle(VT, getCurSDLoc(), Src1, Src2, &MappedOps[0])); return; } if (SrcNumElts > MaskNumElts) { // Analyze the access pattern of the vector to see if we can extract // two subvectors and do the shuffle. The analysis is done by calculating // the range of elements the mask access on both vectors. int MinRange[2] = { static_cast(SrcNumElts), static_cast(SrcNumElts)}; int MaxRange[2] = {-1, -1}; for (unsigned i = 0; i != MaskNumElts; ++i) { int Idx = Mask[i]; unsigned Input = 0; if (Idx < 0) continue; if (Idx >= (int)SrcNumElts) { Input = 1; Idx -= SrcNumElts; } if (Idx > MaxRange[Input]) MaxRange[Input] = Idx; if (Idx < MinRange[Input]) MinRange[Input] = Idx; } // Check if the access is smaller than the vector size and can we find // a reasonable extract index. int RangeUse[2] = { -1, -1 }; // 0 = Unused, 1 = Extract, -1 = Can not // Extract. int StartIdx[2]; // StartIdx to extract from for (unsigned Input = 0; Input < 2; ++Input) { if (MinRange[Input] >= (int)SrcNumElts && MaxRange[Input] < 0) { RangeUse[Input] = 0; // Unused StartIdx[Input] = 0; continue; } // Find a good start index that is a multiple of the mask length. Then // see if the rest of the elements are in range. StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts; if (MaxRange[Input] - StartIdx[Input] < (int)MaskNumElts && StartIdx[Input] + MaskNumElts <= SrcNumElts) RangeUse[Input] = 1; // Extract from a multiple of the mask length. } if (RangeUse[0] == 0 && RangeUse[1] == 0) { setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used. return; } if (RangeUse[0] >= 0 && RangeUse[1] >= 0) { // Extract appropriate subvector and generate a vector shuffle for (unsigned Input = 0; Input < 2; ++Input) { SDValue &Src = Input == 0 ? Src1 : Src2; if (RangeUse[Input] == 0) Src = DAG.getUNDEF(VT); else Src = DAG.getNode( ISD::EXTRACT_SUBVECTOR, getCurSDLoc(), VT, Src, DAG.getConstant(StartIdx[Input], TLI.getVectorIdxTy())); } // Calculate new mask. SmallVector MappedOps; for (unsigned i = 0; i != MaskNumElts; ++i) { int Idx = Mask[i]; if (Idx >= 0) { if (Idx < (int)SrcNumElts) Idx -= StartIdx[0]; else Idx -= SrcNumElts + StartIdx[1] - MaskNumElts; } MappedOps.push_back(Idx); } setValue(&I, DAG.getVectorShuffle(VT, getCurSDLoc(), Src1, Src2, &MappedOps[0])); return; } } // We can't use either concat vectors or extract subvectors so fall back to // replacing the shuffle with extract and build vector. // to insert and build vector. EVT EltVT = VT.getVectorElementType(); EVT IdxVT = TLI.getVectorIdxTy(); SmallVector Ops; for (unsigned i = 0; i != MaskNumElts; ++i) { int Idx = Mask[i]; SDValue Res; if (Idx < 0) { Res = DAG.getUNDEF(EltVT); } else { SDValue &Src = Idx < (int)SrcNumElts ? Src1 : Src2; if (Idx >= (int)SrcNumElts) Idx -= SrcNumElts; Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurSDLoc(), EltVT, Src, DAG.getConstant(Idx, IdxVT)); } Ops.push_back(Res); } setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), VT, Ops)); } void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) { const Value *Op0 = I.getOperand(0); const Value *Op1 = I.getOperand(1); Type *AggTy = I.getType(); Type *ValTy = Op1->getType(); bool IntoUndef = isa(Op0); bool FromUndef = isa(Op1); unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SmallVector AggValueVTs; ComputeValueVTs(TLI, AggTy, AggValueVTs); SmallVector ValValueVTs; ComputeValueVTs(TLI, ValTy, ValValueVTs); unsigned NumAggValues = AggValueVTs.size(); unsigned NumValValues = ValValueVTs.size(); SmallVector Values(NumAggValues); // Ignore an insertvalue that produces an empty object if (!NumAggValues) { setValue(&I, DAG.getUNDEF(MVT(MVT::Other))); return; } SDValue Agg = getValue(Op0); unsigned i = 0; // Copy the beginning value(s) from the original aggregate. for (; i != LinearIndex; ++i) Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) : SDValue(Agg.getNode(), Agg.getResNo() + i); // Copy values from the inserted value(s). if (NumValValues) { SDValue Val = getValue(Op1); for (; i != LinearIndex + NumValValues; ++i) Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) : SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex); } // Copy remaining value(s) from the original aggregate. for (; i != NumAggValues; ++i) Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) : SDValue(Agg.getNode(), Agg.getResNo() + i); setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), DAG.getVTList(AggValueVTs), Values)); } void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) { const Value *Op0 = I.getOperand(0); Type *AggTy = Op0->getType(); Type *ValTy = I.getType(); bool OutOfUndef = isa(Op0); unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SmallVector ValValueVTs; ComputeValueVTs(TLI, ValTy, ValValueVTs); unsigned NumValValues = ValValueVTs.size(); // Ignore a extractvalue that produces an empty object if (!NumValValues) { setValue(&I, DAG.getUNDEF(MVT(MVT::Other))); return; } SmallVector Values(NumValValues); SDValue Agg = getValue(Op0); // Copy out the selected value(s). for (unsigned i = LinearIndex; i != LinearIndex + NumValValues; ++i) Values[i - LinearIndex] = OutOfUndef ? DAG.getUNDEF(Agg.getNode()->getValueType(Agg.getResNo() + i)) : SDValue(Agg.getNode(), Agg.getResNo() + i); setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), DAG.getVTList(ValValueVTs), Values)); } void SelectionDAGBuilder::visitGetElementPtr(const User &I) { Value *Op0 = I.getOperand(0); // Note that the pointer operand may be a vector of pointers. Take the scalar // element which holds a pointer. Type *Ty = Op0->getType()->getScalarType(); unsigned AS = Ty->getPointerAddressSpace(); SDValue N = getValue(Op0); for (GetElementPtrInst::const_op_iterator OI = I.op_begin()+1, E = I.op_end(); OI != E; ++OI) { const Value *Idx = *OI; if (StructType *StTy = dyn_cast(Ty)) { unsigned Field = cast(Idx)->getUniqueInteger().getZExtValue(); if (Field) { // N = N + Offset uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field); N = DAG.getNode(ISD::ADD, getCurSDLoc(), N.getValueType(), N, DAG.getConstant(Offset, N.getValueType())); } Ty = StTy->getElementType(Field); } else { Ty = cast(Ty)->getElementType(); // If this is a constant subscript, handle it quickly. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (const ConstantInt *CI = dyn_cast(Idx)) { if (CI->isZero()) continue; uint64_t Offs = DL->getTypeAllocSize(Ty)*cast(CI)->getSExtValue(); SDValue OffsVal; EVT PTy = TLI.getPointerTy(AS); unsigned PtrBits = PTy.getSizeInBits(); if (PtrBits < 64) OffsVal = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), PTy, DAG.getConstant(Offs, MVT::i64)); else OffsVal = DAG.getConstant(Offs, PTy); N = DAG.getNode(ISD::ADD, getCurSDLoc(), N.getValueType(), N, OffsVal); continue; } // N = N + Idx * ElementSize; APInt ElementSize = APInt(TLI.getPointerSizeInBits(AS), DL->getTypeAllocSize(Ty)); SDValue IdxN = getValue(Idx); // If the index is smaller or larger than intptr_t, truncate or extend // it. IdxN = DAG.getSExtOrTrunc(IdxN, getCurSDLoc(), N.getValueType()); // If this is a multiply by a power of two, turn it into a shl // immediately. This is a very common case. if (ElementSize != 1) { if (ElementSize.isPowerOf2()) { unsigned Amt = ElementSize.logBase2(); IdxN = DAG.getNode(ISD::SHL, getCurSDLoc(), N.getValueType(), IdxN, DAG.getConstant(Amt, IdxN.getValueType())); } else { SDValue Scale = DAG.getConstant(ElementSize, IdxN.getValueType()); IdxN = DAG.getNode(ISD::MUL, getCurSDLoc(), N.getValueType(), IdxN, Scale); } } N = DAG.getNode(ISD::ADD, getCurSDLoc(), N.getValueType(), N, IdxN); } } setValue(&I, N); } void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { // If this is a fixed sized alloca in the entry block of the function, // allocate it statically on the stack. if (FuncInfo.StaticAllocaMap.count(&I)) return; // getValue will auto-populate this. Type *Ty = I.getAllocatedType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty); unsigned Align = std::max((unsigned)TLI.getDataLayout()->getPrefTypeAlignment(Ty), I.getAlignment()); SDValue AllocSize = getValue(I.getArraySize()); EVT IntPtr = TLI.getPointerTy(); if (AllocSize.getValueType() != IntPtr) AllocSize = DAG.getZExtOrTrunc(AllocSize, getCurSDLoc(), IntPtr); AllocSize = DAG.getNode(ISD::MUL, getCurSDLoc(), IntPtr, AllocSize, DAG.getConstant(TySize, IntPtr)); // Handle alignment. If the requested alignment is less than or equal to // the stack alignment, ignore it. If the size is greater than or equal to // the stack alignment, we note this in the DYNAMIC_STACKALLOC node. unsigned StackAlign = DAG.getSubtarget().getFrameLowering()->getStackAlignment(); if (Align <= StackAlign) Align = 0; // Round the size of the allocation up to the stack alignment size // by add SA-1 to the size. AllocSize = DAG.getNode(ISD::ADD, getCurSDLoc(), AllocSize.getValueType(), AllocSize, DAG.getIntPtrConstant(StackAlign-1)); // Mask out the low bits for alignment purposes. AllocSize = DAG.getNode(ISD::AND, getCurSDLoc(), AllocSize.getValueType(), AllocSize, DAG.getIntPtrConstant(~(uint64_t)(StackAlign-1))); SDValue Ops[] = { getRoot(), AllocSize, DAG.getIntPtrConstant(Align) }; SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other); SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, getCurSDLoc(), VTs, Ops); setValue(&I, DSA); DAG.setRoot(DSA.getValue(1)); assert(FuncInfo.MF->getFrameInfo()->hasVarSizedObjects()); } void SelectionDAGBuilder::visitLoad(const LoadInst &I) { if (I.isAtomic()) return visitAtomicLoad(I); const Value *SV = I.getOperand(0); SDValue Ptr = getValue(SV); Type *Ty = I.getType(); bool isVolatile = I.isVolatile(); bool isNonTemporal = I.getMetadata(LLVMContext::MD_nontemporal) != nullptr; bool isInvariant = I.getMetadata(LLVMContext::MD_invariant_load) != nullptr; unsigned Alignment = I.getAlignment(); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SmallVector ValueVTs; SmallVector Offsets; ComputeValueVTs(TLI, Ty, ValueVTs, &Offsets); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) return; SDValue Root; bool ConstantMemory = false; if (isVolatile || NumValues > MaxParallelChains) // Serialize volatile loads with other side effects. Root = getRoot(); else if (AA->pointsToConstantMemory( AliasAnalysis::Location(SV, AA->getTypeStoreSize(Ty), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. Root = DAG.getEntryNode(); ConstantMemory = true; } else { // Do not serialize non-volatile loads against each other. Root = DAG.getRoot(); } if (isVolatile) Root = TLI.prepareVolatileOrAtomicLoad(Root, getCurSDLoc(), DAG); SmallVector Values(NumValues); SmallVector Chains(std::min(unsigned(MaxParallelChains), NumValues)); EVT PtrVT = Ptr.getValueType(); unsigned ChainI = 0; for (unsigned i = 0; i != NumValues; ++i, ++ChainI) { // Serializing loads here may result in excessive register pressure, and // TokenFactor places arbitrary choke points on the scheduler. SD scheduling // could recover a bit by hoisting nodes upward in the chain by recognizing // they are side-effect free or do not alias. The optimizer should really // avoid this case by converting large object/array copies to llvm.memcpy // (MaxParallelChains should always remain as failsafe). if (ChainI == MaxParallelChains) { assert(PendingLoads.empty() && "PendingLoads must be serialized first"); SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, makeArrayRef(Chains.data(), ChainI)); Root = Chain; ChainI = 0; } SDValue A = DAG.getNode(ISD::ADD, getCurSDLoc(), PtrVT, Ptr, DAG.getConstant(Offsets[i], PtrVT)); SDValue L = DAG.getLoad(ValueVTs[i], getCurSDLoc(), Root, A, MachinePointerInfo(SV, Offsets[i]), isVolatile, isNonTemporal, isInvariant, Alignment, AAInfo, Ranges); Values[i] = L; Chains[ChainI] = L.getValue(1); } if (!ConstantMemory) { SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, makeArrayRef(Chains.data(), ChainI)); if (isVolatile) DAG.setRoot(Chain); else PendingLoads.push_back(Chain); } setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), DAG.getVTList(ValueVTs), Values)); } void SelectionDAGBuilder::visitStore(const StoreInst &I) { if (I.isAtomic()) return visitAtomicStore(I); const Value *SrcV = I.getOperand(0); const Value *PtrV = I.getOperand(1); SmallVector ValueVTs; SmallVector Offsets; ComputeValueVTs(DAG.getTargetLoweringInfo(), SrcV->getType(), ValueVTs, &Offsets); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) return; // Get the lowered operands. Note that we do this after // checking if NumResults is zero, because with zero results // the operands won't have values in the map. SDValue Src = getValue(SrcV); SDValue Ptr = getValue(PtrV); SDValue Root = getRoot(); SmallVector Chains(std::min(unsigned(MaxParallelChains), NumValues)); EVT PtrVT = Ptr.getValueType(); bool isVolatile = I.isVolatile(); bool isNonTemporal = I.getMetadata(LLVMContext::MD_nontemporal) != nullptr; unsigned Alignment = I.getAlignment(); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); unsigned ChainI = 0; for (unsigned i = 0; i != NumValues; ++i, ++ChainI) { // See visitLoad comments. if (ChainI == MaxParallelChains) { SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, makeArrayRef(Chains.data(), ChainI)); Root = Chain; ChainI = 0; } SDValue Add = DAG.getNode(ISD::ADD, getCurSDLoc(), PtrVT, Ptr, DAG.getConstant(Offsets[i], PtrVT)); SDValue St = DAG.getStore(Root, getCurSDLoc(), SDValue(Src.getNode(), Src.getResNo() + i), Add, MachinePointerInfo(PtrV, Offsets[i]), isVolatile, isNonTemporal, Alignment, AAInfo); Chains[ChainI] = St; } SDValue StoreNode = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, makeArrayRef(Chains.data(), ChainI)); DAG.setRoot(StoreNode); } void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) { SDLoc sdl = getCurSDLoc(); // llvm.masked.store.*(Src0, Ptr, alignemt, Mask) Value *PtrOperand = I.getArgOperand(1); SDValue Ptr = getValue(PtrOperand); SDValue Src0 = getValue(I.getArgOperand(0)); SDValue Mask = getValue(I.getArgOperand(3)); EVT VT = Src0.getValueType(); unsigned Alignment = (cast(I.getArgOperand(2)))->getZExtValue(); if (!Alignment) Alignment = DAG.getEVTAlignment(VT); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore, VT.getStoreSize(), Alignment, AAInfo); - SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, MMO); + SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT, + MMO, false); DAG.setRoot(StoreNode); setValue(&I, StoreNode); } void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) { SDLoc sdl = getCurSDLoc(); // @llvm.masked.load.*(Ptr, alignment, Mask, Src0) Value *PtrOperand = I.getArgOperand(0); SDValue Ptr = getValue(PtrOperand); SDValue Src0 = getValue(I.getArgOperand(3)); SDValue Mask = getValue(I.getArgOperand(2)); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = TLI.getValueType(I.getType()); unsigned Alignment = (cast(I.getArgOperand(1)))->getZExtValue(); if (!Alignment) Alignment = DAG.getEVTAlignment(VT); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); SDValue InChain = DAG.getRoot(); if (AA->pointsToConstantMemory( AliasAnalysis::Location(PtrOperand, AA->getTypeStoreSize(I.getType()), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. InChain = DAG.getEntryNode(); } MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, VT.getStoreSize(), Alignment, AAInfo, Ranges); - SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, MMO); + SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO, + ISD::NON_EXTLOAD); SDValue OutChain = Load.getValue(1); DAG.setRoot(OutChain); setValue(&I, Load); } void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) { SDLoc dl = getCurSDLoc(); AtomicOrdering SuccessOrder = I.getSuccessOrdering(); AtomicOrdering FailureOrder = I.getFailureOrdering(); SynchronizationScope Scope = I.getSynchScope(); SDValue InChain = getRoot(); MVT MemVT = getValue(I.getCompareOperand()).getSimpleValueType(); SDVTList VTs = DAG.getVTList(MemVT, MVT::i1, MVT::Other); SDValue L = DAG.getAtomicCmpSwap( ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, MemVT, VTs, InChain, getValue(I.getPointerOperand()), getValue(I.getCompareOperand()), getValue(I.getNewValOperand()), MachinePointerInfo(I.getPointerOperand()), /*Alignment=*/ 0, SuccessOrder, FailureOrder, Scope); SDValue OutChain = L.getValue(2); setValue(&I, L); DAG.setRoot(OutChain); } void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) { SDLoc dl = getCurSDLoc(); ISD::NodeType NT; switch (I.getOperation()) { default: llvm_unreachable("Unknown atomicrmw operation"); case AtomicRMWInst::Xchg: NT = ISD::ATOMIC_SWAP; break; case AtomicRMWInst::Add: NT = ISD::ATOMIC_LOAD_ADD; break; case AtomicRMWInst::Sub: NT = ISD::ATOMIC_LOAD_SUB; break; case AtomicRMWInst::And: NT = ISD::ATOMIC_LOAD_AND; break; case AtomicRMWInst::Nand: NT = ISD::ATOMIC_LOAD_NAND; break; case AtomicRMWInst::Or: NT = ISD::ATOMIC_LOAD_OR; break; case AtomicRMWInst::Xor: NT = ISD::ATOMIC_LOAD_XOR; break; case AtomicRMWInst::Max: NT = ISD::ATOMIC_LOAD_MAX; break; case AtomicRMWInst::Min: NT = ISD::ATOMIC_LOAD_MIN; break; case AtomicRMWInst::UMax: NT = ISD::ATOMIC_LOAD_UMAX; break; case AtomicRMWInst::UMin: NT = ISD::ATOMIC_LOAD_UMIN; break; } AtomicOrdering Order = I.getOrdering(); SynchronizationScope Scope = I.getSynchScope(); SDValue InChain = getRoot(); SDValue L = DAG.getAtomic(NT, dl, getValue(I.getValOperand()).getSimpleValueType(), InChain, getValue(I.getPointerOperand()), getValue(I.getValOperand()), I.getPointerOperand(), /* Alignment=*/ 0, Order, Scope); SDValue OutChain = L.getValue(1); setValue(&I, L); DAG.setRoot(OutChain); } void SelectionDAGBuilder::visitFence(const FenceInst &I) { SDLoc dl = getCurSDLoc(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Ops[3]; Ops[0] = getRoot(); Ops[1] = DAG.getConstant(I.getOrdering(), TLI.getPointerTy()); Ops[2] = DAG.getConstant(I.getSynchScope(), TLI.getPointerTy()); DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops)); } void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) { SDLoc dl = getCurSDLoc(); AtomicOrdering Order = I.getOrdering(); SynchronizationScope Scope = I.getSynchScope(); SDValue InChain = getRoot(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = TLI.getValueType(I.getType()); if (I.getAlignment() < VT.getSizeInBits() / 8) report_fatal_error("Cannot generate unaligned atomic load"); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()), MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad, VT.getStoreSize(), I.getAlignment() ? I.getAlignment() : DAG.getEVTAlignment(VT)); InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG); SDValue L = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, InChain, getValue(I.getPointerOperand()), MMO, Order, Scope); SDValue OutChain = L.getValue(1); setValue(&I, L); DAG.setRoot(OutChain); } void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) { SDLoc dl = getCurSDLoc(); AtomicOrdering Order = I.getOrdering(); SynchronizationScope Scope = I.getSynchScope(); SDValue InChain = getRoot(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = TLI.getValueType(I.getValueOperand()->getType()); if (I.getAlignment() < VT.getSizeInBits() / 8) report_fatal_error("Cannot generate unaligned atomic store"); SDValue OutChain = DAG.getAtomic(ISD::ATOMIC_STORE, dl, VT, InChain, getValue(I.getPointerOperand()), getValue(I.getValueOperand()), I.getPointerOperand(), I.getAlignment(), Order, Scope); DAG.setRoot(OutChain); } /// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC /// node. void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic) { bool HasChain = !I.doesNotAccessMemory(); bool OnlyLoad = HasChain && I.onlyReadsMemory(); // Build the operand list. SmallVector Ops; if (HasChain) { // If this intrinsic has side-effects, chainify it. if (OnlyLoad) { // We don't need to serialize loads against other loads. Ops.push_back(DAG.getRoot()); } else { Ops.push_back(getRoot()); } } // Info is set by getTgtMemInstrinsic TargetLowering::IntrinsicInfo Info; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, Intrinsic); // Add the intrinsic ID as an integer operand if it's not a target intrinsic. if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID || Info.opc == ISD::INTRINSIC_W_CHAIN) Ops.push_back(DAG.getTargetConstant(Intrinsic, TLI.getPointerTy())); // Add all operands of the call to the operand list. for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { SDValue Op = getValue(I.getArgOperand(i)); Ops.push_back(Op); } SmallVector ValueVTs; ComputeValueVTs(TLI, I.getType(), ValueVTs); if (HasChain) ValueVTs.push_back(MVT::Other); SDVTList VTs = DAG.getVTList(ValueVTs); // Create the node. SDValue Result; if (IsTgtIntrinsic) { // This is target intrinsic that touches memory Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT, MachinePointerInfo(Info.ptrVal, Info.offset), Info.align, Info.vol, Info.readMem, Info.writeMem, Info.size); } else if (!HasChain) { Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops); } else if (!I.getType()->isVoidTy()) { Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops); } else { Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops); } if (HasChain) { SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1); if (OnlyLoad) PendingLoads.push_back(Chain); else DAG.setRoot(Chain); } if (!I.getType()->isVoidTy()) { if (VectorType *PTy = dyn_cast(I.getType())) { EVT VT = TLI.getValueType(PTy); Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result); } setValue(&I, Result); } } /// GetSignificand - Get the significand and build it into a floating-point /// number with exponent of 1: /// /// Op = (Op & 0x007fffff) | 0x3f800000; /// /// where Op is the hexadecimal representation of floating point value. static SDValue GetSignificand(SelectionDAG &DAG, SDValue Op, SDLoc dl) { SDValue t1 = DAG.getNode(ISD::AND, dl, MVT::i32, Op, DAG.getConstant(0x007fffff, MVT::i32)); SDValue t2 = DAG.getNode(ISD::OR, dl, MVT::i32, t1, DAG.getConstant(0x3f800000, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, t2); } /// GetExponent - Get the exponent: /// /// (float)(int)(((Op & 0x7f800000) >> 23) - 127); /// /// where Op is the hexadecimal representation of floating point value. static SDValue GetExponent(SelectionDAG &DAG, SDValue Op, const TargetLowering &TLI, SDLoc dl) { SDValue t0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op, DAG.getConstant(0x7f800000, MVT::i32)); SDValue t1 = DAG.getNode(ISD::SRL, dl, MVT::i32, t0, DAG.getConstant(23, TLI.getPointerTy())); SDValue t2 = DAG.getNode(ISD::SUB, dl, MVT::i32, t1, DAG.getConstant(127, MVT::i32)); return DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, t2); } /// getF32Constant - Get 32-bit floating point constant. static SDValue getF32Constant(SelectionDAG &DAG, unsigned Flt) { return DAG.getConstantFP(APFloat(APFloat::IEEEsingle, APInt(32, Flt)), MVT::f32); } /// expandExp - Lower an exp intrinsic. Handles the special sequences for /// limited-precision mode. static SDValue expandExp(SDLoc dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { // Put the exponent in the right bit position for later addition to the // final result: // // #define LOG2OFe 1.4426950f // IntegerPartOfX = ((int32_t)(X * LOG2OFe)); SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op, getF32Constant(DAG, 0x3fb8aa3b)); SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0); // FractionalPartOfX = (X * LOG2OFe) - (float)IntegerPartOfX; SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX); SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1); // IntegerPartOfX <<= 23; IntegerPartOfX = DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX, DAG.getConstant(23, TLI.getPointerTy())); SDValue TwoToFracPartOfX; if (LimitFloatPrecision <= 6) { // For floating-point precision of 6: // // TwoToFractionalPartOfX = // 0.997535578f + // (0.735607626f + 0.252464424f * x) * x; // // error 0.0144103317, which is 6 bits SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3e814304)); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3f3c50c8)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); TwoToFracPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x3f7f5e7e)); } else if (LimitFloatPrecision <= 12) { // For floating-point precision of 12: // // TwoToFractionalPartOfX = // 0.999892986f + // (0.696457318f + // (0.224338339f + 0.792043434e-1f * x) * x) * x; // // 0.000107046256 error, which is 13 to 14 bits SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3da235e3)); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3e65b8f3)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x3f324b07)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); TwoToFracPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t6, getF32Constant(DAG, 0x3f7ff8fd)); } else { // LimitFloatPrecision <= 18 // For floating-point precision of 18: // // TwoToFractionalPartOfX = // 0.999999982f + // (0.693148872f + // (0.240227044f + // (0.554906021e-1f + // (0.961591928e-2f + // (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x; // // error 2.47208000*10^(-7), which is better than 18 bits SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3924b03e)); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3ab24b87)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x3c1d8c17)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6, getF32Constant(DAG, 0x3d634a1d)); SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X); SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8, getF32Constant(DAG, 0x3e75fe14)); SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X); SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10, getF32Constant(DAG, 0x3f317234)); SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X); TwoToFracPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t12, getF32Constant(DAG, 0x3f800000)); } // Add the exponent into the result in integer domain. SDValue t13 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, TwoToFracPartOfX); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, DAG.getNode(ISD::ADD, dl, MVT::i32, t13, IntegerPartOfX)); } // No special expansion. return DAG.getNode(ISD::FEXP, dl, Op.getValueType(), Op); } /// expandLog - Lower a log intrinsic. Handles the special sequences for /// limited-precision mode. static SDValue expandLog(SDLoc dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); // Scale the exponent by log(2) [0.69314718f]. SDValue Exp = GetExponent(DAG, Op1, TLI, dl); SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp, getF32Constant(DAG, 0x3f317218)); // Get the significand and build it into a floating-point number with // exponent of 1. SDValue X = GetSignificand(DAG, Op1, dl); SDValue LogOfMantissa; if (LimitFloatPrecision <= 6) { // For floating-point precision of 6: // // LogofMantissa = // -1.1609546f + // (1.4034025f - 0.23903021f * x) * x; // // error 0.0034276066, which is better than 8 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0xbe74c456)); SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0, getF32Constant(DAG, 0x3fb3a2b1)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2, getF32Constant(DAG, 0x3f949a29)); } else if (LimitFloatPrecision <= 12) { // For floating-point precision of 12: // // LogOfMantissa = // -1.7417939f + // (2.8212026f + // (-1.4699568f + // (0.44717955f - 0.56570851e-1f * x) * x) * x) * x; // // error 0.000061011436, which is 14 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0xbd67b6d6)); SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0, getF32Constant(DAG, 0x3ee4f4b8)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2, getF32Constant(DAG, 0x3fbc278b)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x40348e95)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6, getF32Constant(DAG, 0x3fdef31a)); } else { // LimitFloatPrecision <= 18 // For floating-point precision of 18: // // LogOfMantissa = // -2.1072184f + // (4.2372794f + // (-3.7029485f + // (2.2781945f + // (-0.87823314f + // (0.19073739f - 0.17809712e-1f * x) * x) * x) * x) * x)*x; // // error 0.0000023660568, which is better than 18 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0xbc91e5ac)); SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0, getF32Constant(DAG, 0x3e4350aa)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2, getF32Constant(DAG, 0x3f60d3e3)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x4011cdf0)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6, getF32Constant(DAG, 0x406cfd1c)); SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X); SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8, getF32Constant(DAG, 0x408797cb)); SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X); LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10, getF32Constant(DAG, 0x4006dcab)); } return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, LogOfMantissa); } // No special expansion. return DAG.getNode(ISD::FLOG, dl, Op.getValueType(), Op); } /// expandLog2 - Lower a log2 intrinsic. Handles the special sequences for /// limited-precision mode. static SDValue expandLog2(SDLoc dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); // Get the exponent. SDValue LogOfExponent = GetExponent(DAG, Op1, TLI, dl); // Get the significand and build it into a floating-point number with // exponent of 1. SDValue X = GetSignificand(DAG, Op1, dl); // Different possible minimax approximations of significand in // floating-point for various degrees of accuracy over [1,2]. SDValue Log2ofMantissa; if (LimitFloatPrecision <= 6) { // For floating-point precision of 6: // // Log2ofMantissa = -1.6749035f + (2.0246817f - .34484768f * x) * x; // // error 0.0049451742, which is more than 7 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0xbeb08fe0)); SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0, getF32Constant(DAG, 0x40019463)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2, getF32Constant(DAG, 0x3fd6633d)); } else if (LimitFloatPrecision <= 12) { // For floating-point precision of 12: // // Log2ofMantissa = // -2.51285454f + // (4.07009056f + // (-2.12067489f + // (.645142248f - 0.816157886e-1f * x) * x) * x) * x; // // error 0.0000876136000, which is better than 13 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0xbda7262e)); SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0, getF32Constant(DAG, 0x3f25280b)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2, getF32Constant(DAG, 0x4007b923)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x40823e2f)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6, getF32Constant(DAG, 0x4020d29c)); } else { // LimitFloatPrecision <= 18 // For floating-point precision of 18: // // Log2ofMantissa = // -3.0400495f + // (6.1129976f + // (-5.3420409f + // (3.2865683f + // (-1.2669343f + // (0.27515199f - // 0.25691327e-1f * x) * x) * x) * x) * x) * x; // // error 0.0000018516, which is better than 18 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0xbcd2769e)); SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0, getF32Constant(DAG, 0x3e8ce0b9)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2, getF32Constant(DAG, 0x3fa22ae7)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x40525723)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6, getF32Constant(DAG, 0x40aaf200)); SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X); SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8, getF32Constant(DAG, 0x40c39dad)); SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X); Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10, getF32Constant(DAG, 0x4042902c)); } return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log2ofMantissa); } // No special expansion. return DAG.getNode(ISD::FLOG2, dl, Op.getValueType(), Op); } /// expandLog10 - Lower a log10 intrinsic. Handles the special sequences for /// limited-precision mode. static SDValue expandLog10(SDLoc dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); // Scale the exponent by log10(2) [0.30102999f]. SDValue Exp = GetExponent(DAG, Op1, TLI, dl); SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp, getF32Constant(DAG, 0x3e9a209a)); // Get the significand and build it into a floating-point number with // exponent of 1. SDValue X = GetSignificand(DAG, Op1, dl); SDValue Log10ofMantissa; if (LimitFloatPrecision <= 6) { // For floating-point precision of 6: // // Log10ofMantissa = // -0.50419619f + // (0.60948995f - 0.10380950f * x) * x; // // error 0.0014886165, which is 6 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0xbdd49a13)); SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0, getF32Constant(DAG, 0x3f1c0789)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2, getF32Constant(DAG, 0x3f011300)); } else if (LimitFloatPrecision <= 12) { // For floating-point precision of 12: // // Log10ofMantissa = // -0.64831180f + // (0.91751397f + // (-0.31664806f + 0.47637168e-1f * x) * x) * x; // // error 0.00019228036, which is better than 12 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3d431f31)); SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, getF32Constant(DAG, 0x3ea21fb2)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3f6ae232)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4, getF32Constant(DAG, 0x3f25f7c3)); } else { // LimitFloatPrecision <= 18 // For floating-point precision of 18: // // Log10ofMantissa = // -0.84299375f + // (1.5327582f + // (-1.0688956f + // (0.49102474f + // (-0.12539807f + 0.13508273e-1f * x) * x) * x) * x) * x; // // error 0.0000037995730, which is better than 18 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3c5d51ce)); SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, getF32Constant(DAG, 0x3e00685a)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3efb6798)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4, getF32Constant(DAG, 0x3f88d192)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6, getF32Constant(DAG, 0x3fc4316c)); SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X); Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t8, getF32Constant(DAG, 0x3f57ce70)); } return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log10ofMantissa); } // No special expansion. return DAG.getNode(ISD::FLOG10, dl, Op.getValueType(), Op); } /// expandExp2 - Lower an exp2 intrinsic. Handles the special sequences for /// limited-precision mode. static SDValue expandExp2(SDLoc dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Op); // FractionalPartOfX = x - (float)IntegerPartOfX; SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX); SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, Op, t1); // IntegerPartOfX <<= 23; IntegerPartOfX = DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX, DAG.getConstant(23, TLI.getPointerTy())); SDValue TwoToFractionalPartOfX; if (LimitFloatPrecision <= 6) { // For floating-point precision of 6: // // TwoToFractionalPartOfX = // 0.997535578f + // (0.735607626f + 0.252464424f * x) * x; // // error 0.0144103317, which is 6 bits SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3e814304)); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3f3c50c8)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x3f7f5e7e)); } else if (LimitFloatPrecision <= 12) { // For floating-point precision of 12: // // TwoToFractionalPartOfX = // 0.999892986f + // (0.696457318f + // (0.224338339f + 0.792043434e-1f * x) * x) * x; // // error 0.000107046256, which is 13 to 14 bits SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3da235e3)); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3e65b8f3)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x3f324b07)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t6, getF32Constant(DAG, 0x3f7ff8fd)); } else { // LimitFloatPrecision <= 18 // For floating-point precision of 18: // // TwoToFractionalPartOfX = // 0.999999982f + // (0.693148872f + // (0.240227044f + // (0.554906021e-1f + // (0.961591928e-2f + // (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x; // error 2.47208000*10^(-7), which is better than 18 bits SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3924b03e)); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3ab24b87)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x3c1d8c17)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6, getF32Constant(DAG, 0x3d634a1d)); SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X); SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8, getF32Constant(DAG, 0x3e75fe14)); SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X); SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10, getF32Constant(DAG, 0x3f317234)); SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X); TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t12, getF32Constant(DAG, 0x3f800000)); } // Add the exponent into the result in integer domain. SDValue t13 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, TwoToFractionalPartOfX); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, DAG.getNode(ISD::ADD, dl, MVT::i32, t13, IntegerPartOfX)); } // No special expansion. return DAG.getNode(ISD::FEXP2, dl, Op.getValueType(), Op); } /// visitPow - Lower a pow intrinsic. Handles the special sequences for /// limited-precision mode with x == 10.0f. static SDValue expandPow(SDLoc dl, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const TargetLowering &TLI) { bool IsExp10 = false; if (LHS.getValueType() == MVT::f32 && RHS.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { if (ConstantFPSDNode *LHSC = dyn_cast(LHS)) { APFloat Ten(10.0f); IsExp10 = LHSC->isExactlyValue(Ten); } } if (IsExp10) { // Put the exponent in the right bit position for later addition to the // final result: // // #define LOG2OF10 3.3219281f // IntegerPartOfX = (int32_t)(x * LOG2OF10); SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, RHS, getF32Constant(DAG, 0x40549a78)); SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0); // FractionalPartOfX = x - (float)IntegerPartOfX; SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX); SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1); // IntegerPartOfX <<= 23; IntegerPartOfX = DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX, DAG.getConstant(23, TLI.getPointerTy())); SDValue TwoToFractionalPartOfX; if (LimitFloatPrecision <= 6) { // For floating-point precision of 6: // // twoToFractionalPartOfX = // 0.997535578f + // (0.735607626f + 0.252464424f * x) * x; // // error 0.0144103317, which is 6 bits SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3e814304)); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3f3c50c8)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x3f7f5e7e)); } else if (LimitFloatPrecision <= 12) { // For floating-point precision of 12: // // TwoToFractionalPartOfX = // 0.999892986f + // (0.696457318f + // (0.224338339f + 0.792043434e-1f * x) * x) * x; // // error 0.000107046256, which is 13 to 14 bits SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3da235e3)); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3e65b8f3)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x3f324b07)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t6, getF32Constant(DAG, 0x3f7ff8fd)); } else { // LimitFloatPrecision <= 18 // For floating-point precision of 18: // // TwoToFractionalPartOfX = // 0.999999982f + // (0.693148872f + // (0.240227044f + // (0.554906021e-1f + // (0.961591928e-2f + // (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x; // error 2.47208000*10^(-7), which is better than 18 bits SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3924b03e)); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3ab24b87)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x3c1d8c17)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6, getF32Constant(DAG, 0x3d634a1d)); SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X); SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8, getF32Constant(DAG, 0x3e75fe14)); SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X); SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10, getF32Constant(DAG, 0x3f317234)); SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X); TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t12, getF32Constant(DAG, 0x3f800000)); } SDValue t13 = DAG.getNode(ISD::BITCAST, dl,MVT::i32,TwoToFractionalPartOfX); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, DAG.getNode(ISD::ADD, dl, MVT::i32, t13, IntegerPartOfX)); } // No special expansion. return DAG.getNode(ISD::FPOW, dl, LHS.getValueType(), LHS, RHS); } /// ExpandPowI - Expand a llvm.powi intrinsic. static SDValue ExpandPowI(SDLoc DL, SDValue LHS, SDValue RHS, SelectionDAG &DAG) { // If RHS is a constant, we can expand this out to a multiplication tree, // otherwise we end up lowering to a call to __powidf2 (for example). When // optimizing for size, we only want to do this if the expansion would produce // a small number of multiplies, otherwise we do the full expansion. if (ConstantSDNode *RHSC = dyn_cast(RHS)) { // Get the exponent as a positive value. unsigned Val = RHSC->getSExtValue(); if ((int)Val < 0) Val = -Val; // powi(x, 0) -> 1.0 if (Val == 0) return DAG.getConstantFP(1.0, LHS.getValueType()); const Function *F = DAG.getMachineFunction().getFunction(); if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) || // If optimizing for size, don't insert too many multiplies. This // inserts up to 5 multiplies. CountPopulation_32(Val)+Log2_32(Val) < 7) { // We use the simple binary decomposition method to generate the multiply // sequence. There are more optimal ways to do this (for example, // powi(x,15) generates one more multiply than it should), but this has // the benefit of being both really simple and much better than a libcall. SDValue Res; // Logically starts equal to 1.0 SDValue CurSquare = LHS; while (Val) { if (Val & 1) { if (Res.getNode()) Res = DAG.getNode(ISD::FMUL, DL,Res.getValueType(), Res, CurSquare); else Res = CurSquare; // 1.0*CurSquare. } CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(), CurSquare, CurSquare); Val >>= 1; } // If the original was negative, invert the result, producing 1/(x*x*x). if (RHSC->getSExtValue() < 0) Res = DAG.getNode(ISD::FDIV, DL, LHS.getValueType(), DAG.getConstantFP(1.0, LHS.getValueType()), Res); return Res; } } // Otherwise, expand to a libcall. return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS); } // getTruncatedArgReg - Find underlying register used for an truncated // argument. static unsigned getTruncatedArgReg(const SDValue &N) { if (N.getOpcode() != ISD::TRUNCATE) return 0; const SDValue &Ext = N.getOperand(0); if (Ext.getOpcode() == ISD::AssertZext || Ext.getOpcode() == ISD::AssertSext) { const SDValue &CFR = Ext.getOperand(0); if (CFR.getOpcode() == ISD::CopyFromReg) return cast(CFR.getOperand(1))->getReg(); if (CFR.getOpcode() == ISD::TRUNCATE) return getTruncatedArgReg(CFR); } return 0; } /// EmitFuncArgumentDbgValue - If the DbgValueInst is a dbg_value of a function /// argument, create the corresponding DBG_VALUE machine instruction for it now. /// At the end of instruction selection, they will be inserted to the entry BB. bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable, MDNode *Expr, int64_t Offset, bool IsIndirect, const SDValue &N) { const Argument *Arg = dyn_cast(V); if (!Arg) return false; MachineFunction &MF = DAG.getMachineFunction(); const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo(); // Ignore inlined function arguments here. DIVariable DV(Variable); if (DV.isInlinedFnArgument(MF.getFunction())) return false; Optional Op; // Some arguments' frame index is recorded during argument lowering. if (int FI = FuncInfo.getArgumentFrameIndex(Arg)) Op = MachineOperand::CreateFI(FI); if (!Op && N.getNode()) { unsigned Reg; if (N.getOpcode() == ISD::CopyFromReg) Reg = cast(N.getOperand(1))->getReg(); else Reg = getTruncatedArgReg(N); if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) { MachineRegisterInfo &RegInfo = MF.getRegInfo(); unsigned PR = RegInfo.getLiveInPhysReg(Reg); if (PR) Reg = PR; } if (Reg) Op = MachineOperand::CreateReg(Reg, false); } if (!Op) { // Check if ValueMap has reg number. DenseMap::iterator VMI = FuncInfo.ValueMap.find(V); if (VMI != FuncInfo.ValueMap.end()) Op = MachineOperand::CreateReg(VMI->second, false); } if (!Op && N.getNode()) // Check if frame index is available. if (LoadSDNode *LNode = dyn_cast(N.getNode())) if (FrameIndexSDNode *FINode = dyn_cast(LNode->getBasePtr().getNode())) Op = MachineOperand::CreateFI(FINode->getIndex()); if (!Op) return false; if (Op->isReg()) FuncInfo.ArgDbgValues.push_back( BuildMI(MF, getCurDebugLoc(), TII->get(TargetOpcode::DBG_VALUE), IsIndirect, Op->getReg(), Offset, Variable, Expr)); else FuncInfo.ArgDbgValues.push_back( BuildMI(MF, getCurDebugLoc(), TII->get(TargetOpcode::DBG_VALUE)) .addOperand(*Op) .addImm(Offset) .addMetadata(Variable) .addMetadata(Expr)); return true; } // VisualStudio defines setjmp as _setjmp #if defined(_MSC_VER) && defined(setjmp) && \ !defined(setjmp_undefined_for_msvc) # pragma push_macro("setjmp") # undef setjmp # define setjmp_undefined_for_msvc #endif /// visitIntrinsicCall - Lower the call to the specified intrinsic function. If /// we want to emit this as a call to a named external function, return the name /// otherwise lower it and return null. const char * SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc sdl = getCurSDLoc(); DebugLoc dl = getCurDebugLoc(); SDValue Res; switch (Intrinsic) { default: // By default, turn this into a target intrinsic node. visitTargetIntrinsic(I, Intrinsic); return nullptr; case Intrinsic::vastart: visitVAStart(I); return nullptr; case Intrinsic::vaend: visitVAEnd(I); return nullptr; case Intrinsic::vacopy: visitVACopy(I); return nullptr; case Intrinsic::returnaddress: setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl, TLI.getPointerTy(), getValue(I.getArgOperand(0)))); return nullptr; case Intrinsic::frameaddress: setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl, TLI.getPointerTy(), getValue(I.getArgOperand(0)))); return nullptr; case Intrinsic::read_register: { Value *Reg = I.getArgOperand(0); SDValue RegName = DAG.getMDNode(cast(cast(Reg)->getMetadata())); EVT VT = TLI.getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::READ_REGISTER, sdl, VT, RegName)); return nullptr; } case Intrinsic::write_register: { Value *Reg = I.getArgOperand(0); Value *RegValue = I.getArgOperand(1); SDValue Chain = getValue(RegValue).getOperand(0); SDValue RegName = DAG.getMDNode(cast(cast(Reg)->getMetadata())); DAG.setRoot(DAG.getNode(ISD::WRITE_REGISTER, sdl, MVT::Other, Chain, RegName, getValue(RegValue))); return nullptr; } case Intrinsic::setjmp: return &"_setjmp"[!TLI.usesUnderscoreSetJmp()]; case Intrinsic::longjmp: return &"_longjmp"[!TLI.usesUnderscoreLongJmp()]; case Intrinsic::memcpy: { // Assert for address < 256 since we support only user defined address // spaces. assert(cast(I.getArgOperand(0)->getType())->getAddressSpace() < 256 && cast(I.getArgOperand(1)->getType())->getAddressSpace() < 256 && "Unknown address space"); SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); unsigned Align = cast(I.getArgOperand(3))->getZExtValue(); if (!Align) Align = 1; // @llvm.memcpy defines 0 and 1 to both mean no alignment. bool isVol = cast(I.getArgOperand(4))->getZExtValue(); DAG.setRoot(DAG.getMemcpy(getRoot(), sdl, Op1, Op2, Op3, Align, isVol, false, MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1)))); return nullptr; } case Intrinsic::memset: { // Assert for address < 256 since we support only user defined address // spaces. assert(cast(I.getArgOperand(0)->getType())->getAddressSpace() < 256 && "Unknown address space"); SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); unsigned Align = cast(I.getArgOperand(3))->getZExtValue(); if (!Align) Align = 1; // @llvm.memset defines 0 and 1 to both mean no alignment. bool isVol = cast(I.getArgOperand(4))->getZExtValue(); DAG.setRoot(DAG.getMemset(getRoot(), sdl, Op1, Op2, Op3, Align, isVol, MachinePointerInfo(I.getArgOperand(0)))); return nullptr; } case Intrinsic::memmove: { // Assert for address < 256 since we support only user defined address // spaces. assert(cast(I.getArgOperand(0)->getType())->getAddressSpace() < 256 && cast(I.getArgOperand(1)->getType())->getAddressSpace() < 256 && "Unknown address space"); SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); unsigned Align = cast(I.getArgOperand(3))->getZExtValue(); if (!Align) Align = 1; // @llvm.memmove defines 0 and 1 to both mean no alignment. bool isVol = cast(I.getArgOperand(4))->getZExtValue(); DAG.setRoot(DAG.getMemmove(getRoot(), sdl, Op1, Op2, Op3, Align, isVol, MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1)))); return nullptr; } case Intrinsic::dbg_declare: { const DbgDeclareInst &DI = cast(I); MDNode *Variable = DI.getVariable(); MDNode *Expression = DI.getExpression(); const Value *Address = DI.getAddress(); DIVariable DIVar(Variable); assert((!DIVar || DIVar.isVariable()) && "Variable in DbgDeclareInst should be either null or a DIVariable."); if (!Address || !DIVar) { DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); return nullptr; } // Check if address has undef value. if (isa(Address) || (Address->use_empty() && !isa(Address))) { DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); return nullptr; } SDValue &N = NodeMap[Address]; if (!N.getNode() && isa(Address)) // Check unused arguments map. N = UnusedArgNodeMap[Address]; SDDbgValue *SDV; if (N.getNode()) { if (const BitCastInst *BCI = dyn_cast(Address)) Address = BCI->getOperand(0); // Parameters are handled specially. bool isParameter = (DIVariable(Variable).getTag() == dwarf::DW_TAG_arg_variable || isa(Address)); const AllocaInst *AI = dyn_cast(Address); if (isParameter && !AI) { FrameIndexSDNode *FINode = dyn_cast(N.getNode()); if (FINode) // Byval parameter. We have a frame index at this point. SDV = DAG.getFrameIndexDbgValue( Variable, Expression, FINode->getIndex(), 0, dl, SDNodeOrder); else { // Address is an argument, so try to emit its dbg value using // virtual register info from the FuncInfo.ValueMap. EmitFuncArgumentDbgValue(Address, Variable, Expression, 0, false, N); return nullptr; } } else if (AI) SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), true, 0, dl, SDNodeOrder); else { // Can't do anything with other non-AI cases yet. DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); DEBUG(dbgs() << "non-AllocaInst issue for Address: \n\t"); DEBUG(Address->dump()); return nullptr; } DAG.AddDbgValue(SDV, N.getNode(), isParameter); } else { // If Address is an argument then try to emit its dbg value using // virtual register info from the FuncInfo.ValueMap. if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, 0, false, N)) { // If variable is pinned by a alloca in dominating bb then // use StaticAllocaMap. if (const AllocaInst *AI = dyn_cast(Address)) { if (AI->getParent() != DI.getParent()) { DenseMap::iterator SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { SDV = DAG.getFrameIndexDbgValue(Variable, Expression, SI->second, 0, dl, SDNodeOrder); DAG.AddDbgValue(SDV, nullptr, false); return nullptr; } } } DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); } } return nullptr; } case Intrinsic::dbg_value: { const DbgValueInst &DI = cast(I); DIVariable DIVar(DI.getVariable()); assert((!DIVar || DIVar.isVariable()) && "Variable in DbgValueInst should be either null or a DIVariable."); if (!DIVar) return nullptr; MDNode *Variable = DI.getVariable(); MDNode *Expression = DI.getExpression(); uint64_t Offset = DI.getOffset(); const Value *V = DI.getValue(); if (!V) return nullptr; SDDbgValue *SDV; if (isa(V) || isa(V) || isa(V)) { SDV = DAG.getConstantDbgValue(Variable, Expression, V, Offset, dl, SDNodeOrder); DAG.AddDbgValue(SDV, nullptr, false); } else { // Do not use getValue() in here; we don't want to generate code at // this point if it hasn't been done yet. SDValue N = NodeMap[V]; if (!N.getNode() && isa(V)) // Check unused arguments map. N = UnusedArgNodeMap[V]; if (N.getNode()) { // A dbg.value for an alloca is always indirect. bool IsIndirect = isa(V) || Offset != 0; if (!EmitFuncArgumentDbgValue(V, Variable, Expression, Offset, IsIndirect, N)) { SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), IsIndirect, Offset, dl, SDNodeOrder); DAG.AddDbgValue(SDV, N.getNode(), false); } } else if (!V->use_empty() ) { // Do not call getValue(V) yet, as we don't want to generate code. // Remember it for later. DanglingDebugInfo DDI(&DI, dl, SDNodeOrder); DanglingDebugInfoMap[V] = DDI; } else { // We may expand this to cover more cases. One case where we have no // data available is an unreferenced parameter. DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); } } // Build a debug info table entry. if (const BitCastInst *BCI = dyn_cast(V)) V = BCI->getOperand(0); const AllocaInst *AI = dyn_cast(V); // Don't handle byval struct arguments or VLAs, for example. if (!AI) { DEBUG(dbgs() << "Dropping debug location info for:\n " << DI << "\n"); DEBUG(dbgs() << " Last seen at:\n " << *V << "\n"); return nullptr; } DenseMap::iterator SI = FuncInfo.StaticAllocaMap.find(AI); if (SI == FuncInfo.StaticAllocaMap.end()) return nullptr; // VLAs. return nullptr; } case Intrinsic::eh_typeid_for: { // Find the type id for the given typeinfo. GlobalValue *GV = ExtractTypeInfo(I.getArgOperand(0)); unsigned TypeID = DAG.getMachineFunction().getMMI().getTypeIDFor(GV); Res = DAG.getConstant(TypeID, MVT::i32); setValue(&I, Res); return nullptr; } case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64: DAG.getMachineFunction().getMMI().setCallsEHReturn(true); DAG.setRoot(DAG.getNode(ISD::EH_RETURN, sdl, MVT::Other, getControlRoot(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)))); return nullptr; case Intrinsic::eh_unwind_init: DAG.getMachineFunction().getMMI().setCallsUnwindInit(true); return nullptr; case Intrinsic::eh_dwarf_cfa: { SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getArgOperand(0)), sdl, TLI.getPointerTy()); SDValue Offset = DAG.getNode(ISD::ADD, sdl, CfaArg.getValueType(), DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, sdl, CfaArg.getValueType()), CfaArg); SDValue FA = DAG.getNode(ISD::FRAMEADDR, sdl, TLI.getPointerTy(), DAG.getConstant(0, TLI.getPointerTy())); setValue(&I, DAG.getNode(ISD::ADD, sdl, FA.getValueType(), FA, Offset)); return nullptr; } case Intrinsic::eh_sjlj_callsite: { MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); ConstantInt *CI = dyn_cast(I.getArgOperand(0)); assert(CI && "Non-constant call site value in eh.sjlj.callsite!"); assert(MMI.getCurrentCallSite() == 0 && "Overlapping call sites!"); MMI.setCurrentCallSite(CI->getZExtValue()); return nullptr; } case Intrinsic::eh_sjlj_functioncontext: { // Get and store the index of the function context. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); AllocaInst *FnCtx = cast(I.getArgOperand(0)->stripPointerCasts()); int FI = FuncInfo.StaticAllocaMap[FnCtx]; MFI->setFunctionContextIndex(FI); return nullptr; } case Intrinsic::eh_sjlj_setjmp: { SDValue Ops[2]; Ops[0] = getRoot(); Ops[1] = getValue(I.getArgOperand(0)); SDValue Op = DAG.getNode(ISD::EH_SJLJ_SETJMP, sdl, DAG.getVTList(MVT::i32, MVT::Other), Ops); setValue(&I, Op.getValue(0)); DAG.setRoot(Op.getValue(1)); return nullptr; } case Intrinsic::eh_sjlj_longjmp: { DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_LONGJMP, sdl, MVT::Other, getRoot(), getValue(I.getArgOperand(0)))); return nullptr; } case Intrinsic::masked_load: visitMaskedLoad(I); return nullptr; case Intrinsic::masked_store: visitMaskedStore(I); return nullptr; case Intrinsic::x86_mmx_pslli_w: case Intrinsic::x86_mmx_pslli_d: case Intrinsic::x86_mmx_pslli_q: case Intrinsic::x86_mmx_psrli_w: case Intrinsic::x86_mmx_psrli_d: case Intrinsic::x86_mmx_psrli_q: case Intrinsic::x86_mmx_psrai_w: case Intrinsic::x86_mmx_psrai_d: { SDValue ShAmt = getValue(I.getArgOperand(1)); if (isa(ShAmt)) { visitTargetIntrinsic(I, Intrinsic); return nullptr; } unsigned NewIntrinsic = 0; EVT ShAmtVT = MVT::v2i32; switch (Intrinsic) { case Intrinsic::x86_mmx_pslli_w: NewIntrinsic = Intrinsic::x86_mmx_psll_w; break; case Intrinsic::x86_mmx_pslli_d: NewIntrinsic = Intrinsic::x86_mmx_psll_d; break; case Intrinsic::x86_mmx_pslli_q: NewIntrinsic = Intrinsic::x86_mmx_psll_q; break; case Intrinsic::x86_mmx_psrli_w: NewIntrinsic = Intrinsic::x86_mmx_psrl_w; break; case Intrinsic::x86_mmx_psrli_d: NewIntrinsic = Intrinsic::x86_mmx_psrl_d; break; case Intrinsic::x86_mmx_psrli_q: NewIntrinsic = Intrinsic::x86_mmx_psrl_q; break; case Intrinsic::x86_mmx_psrai_w: NewIntrinsic = Intrinsic::x86_mmx_psra_w; break; case Intrinsic::x86_mmx_psrai_d: NewIntrinsic = Intrinsic::x86_mmx_psra_d; break; default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. } // The vector shift intrinsics with scalars uses 32b shift amounts but // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits // to be zero. // We must do this early because v2i32 is not a legal type. SDValue ShOps[2]; ShOps[0] = ShAmt; ShOps[1] = DAG.getConstant(0, MVT::i32); ShAmt = DAG.getNode(ISD::BUILD_VECTOR, sdl, ShAmtVT, ShOps); EVT DestVT = TLI.getValueType(I.getType()); ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt); Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT, DAG.getConstant(NewIntrinsic, MVT::i32), getValue(I.getArgOperand(0)), ShAmt); setValue(&I, Res); return nullptr; } case Intrinsic::x86_avx_vinsertf128_pd_256: case Intrinsic::x86_avx_vinsertf128_ps_256: case Intrinsic::x86_avx_vinsertf128_si_256: case Intrinsic::x86_avx2_vinserti128: { EVT DestVT = TLI.getValueType(I.getType()); EVT ElVT = TLI.getValueType(I.getArgOperand(1)->getType()); uint64_t Idx = (cast(I.getArgOperand(2))->getZExtValue() & 1) * ElVT.getVectorNumElements(); Res = DAG.getNode(ISD::INSERT_SUBVECTOR, sdl, DestVT, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG.getConstant(Idx, TLI.getVectorIdxTy())); setValue(&I, Res); return nullptr; } case Intrinsic::x86_avx_vextractf128_pd_256: case Intrinsic::x86_avx_vextractf128_ps_256: case Intrinsic::x86_avx_vextractf128_si_256: case Intrinsic::x86_avx2_vextracti128: { EVT DestVT = TLI.getValueType(I.getType()); uint64_t Idx = (cast(I.getArgOperand(1))->getZExtValue() & 1) * DestVT.getVectorNumElements(); Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, DestVT, getValue(I.getArgOperand(0)), DAG.getConstant(Idx, TLI.getVectorIdxTy())); setValue(&I, Res); return nullptr; } case Intrinsic::convertff: case Intrinsic::convertfsi: case Intrinsic::convertfui: case Intrinsic::convertsif: case Intrinsic::convertuif: case Intrinsic::convertss: case Intrinsic::convertsu: case Intrinsic::convertus: case Intrinsic::convertuu: { ISD::CvtCode Code = ISD::CVT_INVALID; switch (Intrinsic) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::convertff: Code = ISD::CVT_FF; break; case Intrinsic::convertfsi: Code = ISD::CVT_FS; break; case Intrinsic::convertfui: Code = ISD::CVT_FU; break; case Intrinsic::convertsif: Code = ISD::CVT_SF; break; case Intrinsic::convertuif: Code = ISD::CVT_UF; break; case Intrinsic::convertss: Code = ISD::CVT_SS; break; case Intrinsic::convertsu: Code = ISD::CVT_SU; break; case Intrinsic::convertus: Code = ISD::CVT_US; break; case Intrinsic::convertuu: Code = ISD::CVT_UU; break; } EVT DestVT = TLI.getValueType(I.getType()); const Value *Op1 = I.getArgOperand(0); Res = DAG.getConvertRndSat(DestVT, sdl, getValue(Op1), DAG.getValueType(DestVT), DAG.getValueType(getValue(Op1).getValueType()), getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)), Code); setValue(&I, Res); return nullptr; } case Intrinsic::powi: setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG)); return nullptr; case Intrinsic::log: setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::log2: setValue(&I, expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::log10: setValue(&I, expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::exp: setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::exp2: setValue(&I, expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::pow: setValue(&I, expandPow(sdl, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG, TLI)); return nullptr; case Intrinsic::sqrt: case Intrinsic::fabs: case Intrinsic::sin: case Intrinsic::cos: case Intrinsic::floor: case Intrinsic::ceil: case Intrinsic::trunc: case Intrinsic::rint: case Intrinsic::nearbyint: case Intrinsic::round: { unsigned Opcode; switch (Intrinsic) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::sqrt: Opcode = ISD::FSQRT; break; case Intrinsic::fabs: Opcode = ISD::FABS; break; case Intrinsic::sin: Opcode = ISD::FSIN; break; case Intrinsic::cos: Opcode = ISD::FCOS; break; case Intrinsic::floor: Opcode = ISD::FFLOOR; break; case Intrinsic::ceil: Opcode = ISD::FCEIL; break; case Intrinsic::trunc: Opcode = ISD::FTRUNC; break; case Intrinsic::rint: Opcode = ISD::FRINT; break; case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break; case Intrinsic::round: Opcode = ISD::FROUND; break; } setValue(&I, DAG.getNode(Opcode, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)))); return nullptr; } case Intrinsic::minnum: setValue(&I, DAG.getNode(ISD::FMINNUM, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)))); return nullptr; case Intrinsic::maxnum: setValue(&I, DAG.getNode(ISD::FMAXNUM, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)))); return nullptr; case Intrinsic::copysign: setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)))); return nullptr; case Intrinsic::fma: setValue(&I, DAG.getNode(ISD::FMA, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)))); return nullptr; case Intrinsic::fmuladd: { EVT VT = TLI.getValueType(I.getType()); if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict && TLI.isFMAFasterThanFMulAndFAdd(VT)) { setValue(&I, DAG.getNode(ISD::FMA, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)))); } else { SDValue Mul = DAG.getNode(ISD::FMUL, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1))); SDValue Add = DAG.getNode(ISD::FADD, sdl, getValue(I.getArgOperand(0)).getValueType(), Mul, getValue(I.getArgOperand(2))); setValue(&I, Add); } return nullptr; } case Intrinsic::convert_to_fp16: setValue(&I, DAG.getNode(ISD::BITCAST, sdl, MVT::i16, DAG.getNode(ISD::FP_ROUND, sdl, MVT::f16, getValue(I.getArgOperand(0)), DAG.getTargetConstant(0, MVT::i32)))); return nullptr; case Intrinsic::convert_from_fp16: setValue(&I, DAG.getNode(ISD::FP_EXTEND, sdl, TLI.getValueType(I.getType()), DAG.getNode(ISD::BITCAST, sdl, MVT::f16, getValue(I.getArgOperand(0))))); return nullptr; case Intrinsic::pcmarker: { SDValue Tmp = getValue(I.getArgOperand(0)); DAG.setRoot(DAG.getNode(ISD::PCMARKER, sdl, MVT::Other, getRoot(), Tmp)); return nullptr; } case Intrinsic::readcyclecounter: { SDValue Op = getRoot(); Res = DAG.getNode(ISD::READCYCLECOUNTER, sdl, DAG.getVTList(MVT::i64, MVT::Other), Op); setValue(&I, Res); DAG.setRoot(Res.getValue(1)); return nullptr; } case Intrinsic::bswap: setValue(&I, DAG.getNode(ISD::BSWAP, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)))); return nullptr; case Intrinsic::cttz: { SDValue Arg = getValue(I.getArgOperand(0)); ConstantInt *CI = cast(I.getArgOperand(1)); EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTTZ : ISD::CTTZ_ZERO_UNDEF, sdl, Ty, Arg)); return nullptr; } case Intrinsic::ctlz: { SDValue Arg = getValue(I.getArgOperand(0)); ConstantInt *CI = cast(I.getArgOperand(1)); EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTLZ : ISD::CTLZ_ZERO_UNDEF, sdl, Ty, Arg)); return nullptr; } case Intrinsic::ctpop: { SDValue Arg = getValue(I.getArgOperand(0)); EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(ISD::CTPOP, sdl, Ty, Arg)); return nullptr; } case Intrinsic::stacksave: { SDValue Op = getRoot(); Res = DAG.getNode(ISD::STACKSAVE, sdl, DAG.getVTList(TLI.getPointerTy(), MVT::Other), Op); setValue(&I, Res); DAG.setRoot(Res.getValue(1)); return nullptr; } case Intrinsic::stackrestore: { Res = getValue(I.getArgOperand(0)); DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res)); return nullptr; } case Intrinsic::stackprotector: { // Emit code into the DAG to store the stack guard onto the stack. MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); EVT PtrTy = TLI.getPointerTy(); SDValue Src, Chain = getRoot(); const Value *Ptr = cast(I.getArgOperand(0))->getPointerOperand(); const GlobalVariable *GV = dyn_cast(Ptr); // See if Ptr is a bitcast. If it is, look through it and see if we can get // global variable __stack_chk_guard. if (!GV) if (const Operator *BC = dyn_cast(Ptr)) if (BC->getOpcode() == Instruction::BitCast) GV = dyn_cast(BC->getOperand(0)); if (GV && TLI.useLoadStackGuardNode()) { // Emit a LOAD_STACK_GUARD node. MachineSDNode *Node = DAG.getMachineNode(TargetOpcode::LOAD_STACK_GUARD, sdl, PtrTy, Chain); MachinePointerInfo MPInfo(GV); MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1); unsigned Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; *MemRefs = MF.getMachineMemOperand(MPInfo, Flags, PtrTy.getSizeInBits() / 8, DAG.getEVTAlignment(PtrTy)); Node->setMemRefs(MemRefs, MemRefs + 1); // Copy the guard value to a virtual register so that it can be // retrieved in the epilogue. Src = SDValue(Node, 0); const TargetRegisterClass *RC = TLI.getRegClassFor(Src.getSimpleValueType()); unsigned Reg = MF.getRegInfo().createVirtualRegister(RC); SPDescriptor.setGuardReg(Reg); Chain = DAG.getCopyToReg(Chain, sdl, Reg, Src); } else { Src = getValue(I.getArgOperand(0)); // The guard's value. } AllocaInst *Slot = cast(I.getArgOperand(1)); int FI = FuncInfo.StaticAllocaMap[Slot]; MFI->setStackProtectorIndex(FI); SDValue FIN = DAG.getFrameIndex(FI, PtrTy); // Store the stack protector onto the stack. Res = DAG.getStore(Chain, sdl, Src, FIN, MachinePointerInfo::getFixedStack(FI), true, false, 0); setValue(&I, Res); DAG.setRoot(Res); return nullptr; } case Intrinsic::objectsize: { // If we don't know by now, we're never going to know. ConstantInt *CI = dyn_cast(I.getArgOperand(1)); assert(CI && "Non-constant type in __builtin_object_size?"); SDValue Arg = getValue(I.getCalledValue()); EVT Ty = Arg.getValueType(); if (CI->isZero()) Res = DAG.getConstant(-1ULL, Ty); else Res = DAG.getConstant(0, Ty); setValue(&I, Res); return nullptr; } case Intrinsic::annotation: case Intrinsic::ptr_annotation: // Drop the intrinsic, but forward the value setValue(&I, getValue(I.getOperand(0))); return nullptr; case Intrinsic::assume: case Intrinsic::var_annotation: // Discard annotate attributes and assumptions return nullptr; case Intrinsic::init_trampoline: { const Function *F = cast(I.getArgOperand(1)->stripPointerCasts()); SDValue Ops[6]; Ops[0] = getRoot(); Ops[1] = getValue(I.getArgOperand(0)); Ops[2] = getValue(I.getArgOperand(1)); Ops[3] = getValue(I.getArgOperand(2)); Ops[4] = DAG.getSrcValue(I.getArgOperand(0)); Ops[5] = DAG.getSrcValue(F); Res = DAG.getNode(ISD::INIT_TRAMPOLINE, sdl, MVT::Other, Ops); DAG.setRoot(Res); return nullptr; } case Intrinsic::adjust_trampoline: { setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, sdl, TLI.getPointerTy(), getValue(I.getArgOperand(0)))); return nullptr; } case Intrinsic::gcroot: if (GFI) { const Value *Alloca = I.getArgOperand(0)->stripPointerCasts(); const Constant *TypeMap = cast(I.getArgOperand(1)); FrameIndexSDNode *FI = cast(getValue(Alloca).getNode()); GFI->addStackRoot(FI->getIndex(), TypeMap); } return nullptr; case Intrinsic::gcread: case Intrinsic::gcwrite: llvm_unreachable("GC failed to lower gcread/gcwrite intrinsics!"); case Intrinsic::flt_rounds: setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, sdl, MVT::i32)); return nullptr; case Intrinsic::expect: { // Just replace __builtin_expect(exp, c) with EXP. setValue(&I, getValue(I.getArgOperand(0))); return nullptr; } case Intrinsic::debugtrap: case Intrinsic::trap: { StringRef TrapFuncName = TM.Options.getTrapFunctionName(); if (TrapFuncName.empty()) { ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ? ISD::TRAP : ISD::DEBUGTRAP; DAG.setRoot(DAG.getNode(Op, sdl,MVT::Other, getRoot())); return nullptr; } TargetLowering::ArgListTy Args; TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(sdl).setChain(getRoot()) .setCallee(CallingConv::C, I.getType(), DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy()), std::move(Args), 0); std::pair Result = TLI.LowerCallTo(CLI); DAG.setRoot(Result.second); return nullptr; } case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: { ISD::NodeType Op; switch (Intrinsic) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::uadd_with_overflow: Op = ISD::UADDO; break; case Intrinsic::sadd_with_overflow: Op = ISD::SADDO; break; case Intrinsic::usub_with_overflow: Op = ISD::USUBO; break; case Intrinsic::ssub_with_overflow: Op = ISD::SSUBO; break; case Intrinsic::umul_with_overflow: Op = ISD::UMULO; break; case Intrinsic::smul_with_overflow: Op = ISD::SMULO; break; } SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1); setValue(&I, DAG.getNode(Op, sdl, VTs, Op1, Op2)); return nullptr; } case Intrinsic::prefetch: { SDValue Ops[5]; unsigned rw = cast(I.getArgOperand(1))->getZExtValue(); Ops[0] = getRoot(); Ops[1] = getValue(I.getArgOperand(0)); Ops[2] = getValue(I.getArgOperand(1)); Ops[3] = getValue(I.getArgOperand(2)); Ops[4] = getValue(I.getArgOperand(3)); DAG.setRoot(DAG.getMemIntrinsicNode(ISD::PREFETCH, sdl, DAG.getVTList(MVT::Other), Ops, EVT::getIntegerVT(*Context, 8), MachinePointerInfo(I.getArgOperand(0)), 0, /* align */ false, /* volatile */ rw==0, /* read */ rw==1)); /* write */ return nullptr; } case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { bool IsStart = (Intrinsic == Intrinsic::lifetime_start); // Stack coloring is not enabled in O0, discard region information. if (TM.getOptLevel() == CodeGenOpt::None) return nullptr; SmallVector Allocas; GetUnderlyingObjects(I.getArgOperand(1), Allocas, DL); for (SmallVectorImpl::iterator Object = Allocas.begin(), E = Allocas.end(); Object != E; ++Object) { AllocaInst *LifetimeObject = dyn_cast_or_null(*Object); // Could not find an Alloca. if (!LifetimeObject) continue; // First check that the Alloca is static, otherwise it won't have a // valid frame index. auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject); if (SI == FuncInfo.StaticAllocaMap.end()) return nullptr; int FI = SI->second; SDValue Ops[2]; Ops[0] = getRoot(); Ops[1] = DAG.getFrameIndex(FI, TLI.getPointerTy(), true); unsigned Opcode = (IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END); Res = DAG.getNode(Opcode, sdl, MVT::Other, Ops); DAG.setRoot(Res); } return nullptr; } case Intrinsic::invariant_start: // Discard region information. setValue(&I, DAG.getUNDEF(TLI.getPointerTy())); return nullptr; case Intrinsic::invariant_end: // Discard region information. return nullptr; case Intrinsic::stackprotectorcheck: { // Do not actually emit anything for this basic block. Instead we initialize // the stack protector descriptor and export the guard variable so we can // access it in FinishBasicBlock. const BasicBlock *BB = I.getParent(); SPDescriptor.initialize(BB, FuncInfo.MBBMap[BB], I); ExportFromCurrentBlock(SPDescriptor.getGuard()); // Flush our exports since we are going to process a terminator. (void)getControlRoot(); return nullptr; } case Intrinsic::clear_cache: return TLI.getClearCacheBuiltinName(); case Intrinsic::donothing: // ignore return nullptr; case Intrinsic::experimental_stackmap: { visitStackmap(I); return nullptr; } case Intrinsic::experimental_patchpoint_void: case Intrinsic::experimental_patchpoint_i64: { visitPatchpoint(&I); return nullptr; } case Intrinsic::experimental_gc_statepoint: { visitStatepoint(I); return nullptr; } case Intrinsic::experimental_gc_result_int: case Intrinsic::experimental_gc_result_float: case Intrinsic::experimental_gc_result_ptr: { visitGCResult(I); return nullptr; } case Intrinsic::experimental_gc_relocate: { visitGCRelocate(I); return nullptr; } case Intrinsic::instrprof_increment: llvm_unreachable("instrprof failed to lower an increment"); case Intrinsic::frameallocate: { MachineFunction &MF = DAG.getMachineFunction(); const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo(); // Do the allocation and map it as a normal value. // FIXME: Maybe we should add this to the alloca map so that we don't have // to register allocate it? uint64_t Size = cast(I.getArgOperand(0))->getZExtValue(); int Alloc = MF.getFrameInfo()->CreateFrameAllocation(Size); MVT PtrVT = TLI.getPointerTy(0); SDValue FIVal = DAG.getFrameIndex(Alloc, PtrVT); setValue(&I, FIVal); // Directly emit a FRAME_ALLOC machine instr. Label assignment emission is // the same on all targets. MCSymbol *FrameAllocSym = MF.getMMI().getContext().getOrCreateFrameAllocSymbol(MF.getName()); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl, TII->get(TargetOpcode::FRAME_ALLOC)) .addSym(FrameAllocSym) .addFrameIndex(Alloc); return nullptr; } case Intrinsic::framerecover: { // i8* @llvm.framerecover(i8* %fn, i8* %fp) MachineFunction &MF = DAG.getMachineFunction(); MVT PtrVT = TLI.getPointerTy(0); // Get the symbol that defines the frame offset. Function *Fn = cast(I.getArgOperand(0)->stripPointerCasts()); MCSymbol *FrameAllocSym = MF.getMMI().getContext().getOrCreateFrameAllocSymbol(Fn->getName()); // Create a TargetExternalSymbol for the label to avoid any target lowering // that would make this PC relative. StringRef Name = FrameAllocSym->getName(); assert(Name.size() == strlen(Name.data()) && "not null terminated"); SDValue OffsetSym = DAG.getTargetExternalSymbol(Name.data(), PtrVT); SDValue OffsetVal = DAG.getNode(ISD::FRAME_ALLOC_RECOVER, sdl, PtrVT, OffsetSym); // Add the offset to the FP. Value *FP = I.getArgOperand(1); SDValue FPVal = getValue(FP); SDValue Add = DAG.getNode(ISD::ADD, sdl, PtrVT, FPVal, OffsetVal); setValue(&I, Add); return nullptr; } } } std::pair SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, MachineBasicBlock *LandingPad) { MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); MCSymbol *BeginLabel = nullptr; if (LandingPad) { // Insert a label before the invoke call to mark the try range. This can be // used to detect deletion of the invoke via the MachineModuleInfo. BeginLabel = MMI.getContext().CreateTempSymbol(); // For SjLj, keep track of which landing pads go with which invokes // so as to maintain the ordering of pads in the LSDA. unsigned CallSiteIndex = MMI.getCurrentCallSite(); if (CallSiteIndex) { MMI.setCallSiteBeginLabel(BeginLabel, CallSiteIndex); LPadToCallSiteMap[LandingPad].push_back(CallSiteIndex); // Now that the call site is handled, stop tracking it. MMI.setCurrentCallSite(0); } // Both PendingLoads and PendingExports must be flushed here; // this call might not return. (void)getRoot(); DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getControlRoot(), BeginLabel)); CLI.setChain(getRoot()); } const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering(); std::pair Result = TLI->LowerCallTo(CLI); assert((CLI.IsTailCall || Result.second.getNode()) && "Non-null chain expected with non-tail call!"); assert((Result.second.getNode() || !Result.first.getNode()) && "Null value expected with tail call!"); if (!Result.second.getNode()) { // As a special case, a null chain means that a tail call has been emitted // and the DAG root is already updated. HasTailCall = true; // Since there's no actual continuation from this block, nothing can be // relying on us setting vregs for them. PendingExports.clear(); } else { DAG.setRoot(Result.second); } if (LandingPad) { // Insert a label at the end of the invoke call to mark the try range. This // can be used to detect deletion of the invoke via the MachineModuleInfo. MCSymbol *EndLabel = MMI.getContext().CreateTempSymbol(); DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getRoot(), EndLabel)); // Inform MachineModuleInfo of range. MMI.addInvoke(LandingPad, BeginLabel, EndLabel); } return Result; } void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool isTailCall, MachineBasicBlock *LandingPad) { PointerType *PT = cast(CS.getCalledValue()->getType()); FunctionType *FTy = cast(PT->getElementType()); Type *RetTy = FTy->getReturnType(); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Args.reserve(CS.arg_size()); for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) { const Value *V = *i; // Skip empty types if (V->getType()->isEmptyTy()) continue; SDValue ArgNode = getValue(V); Entry.Node = ArgNode; Entry.Ty = V->getType(); // Skip the first return-type Attribute to get to params. Entry.setAttributes(&CS, i - CS.arg_begin() + 1); Args.push_back(Entry); } // Check if target-independent constraints permit a tail call here. // Target-dependent constraints are checked within TLI->LowerCallTo. if (isTailCall && !isInTailCallPosition(CS, DAG.getTarget())) isTailCall = false; TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot()) .setCallee(RetTy, FTy, Callee, std::move(Args), CS) .setTailCall(isTailCall); std::pair Result = lowerInvokable(CLI, LandingPad); if (Result.first.getNode()) setValue(CS.getInstruction(), Result.first); } /// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the /// value is equal or not-equal to zero. static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) { for (const User *U : V->users()) { if (const ICmpInst *IC = dyn_cast(U)) if (IC->isEquality()) if (const Constant *C = dyn_cast(IC->getOperand(1))) if (C->isNullValue()) continue; // Unknown instruction. return false; } return true; } static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, Type *LoadTy, SelectionDAGBuilder &Builder) { // Check to see if this load can be trivially constant folded, e.g. if the // input is from a string literal. if (const Constant *LoadInput = dyn_cast(PtrVal)) { // Cast pointer to the type we really want to load. LoadInput = ConstantExpr::getBitCast(const_cast(LoadInput), PointerType::getUnqual(LoadTy)); if (const Constant *LoadCst = ConstantFoldLoadFromConstPtr(const_cast(LoadInput), Builder.DL)) return Builder.getValue(LoadCst); } // Otherwise, we have to emit the load. If the pointer is to unfoldable but // still constant memory, the input chain can be the entry node. SDValue Root; bool ConstantMemory = false; // Do not serialize (non-volatile) loads of constant memory with anything. if (Builder.AA->pointsToConstantMemory(PtrVal)) { Root = Builder.DAG.getEntryNode(); ConstantMemory = true; } else { // Do not serialize non-volatile loads against each other. Root = Builder.DAG.getRoot(); } SDValue Ptr = Builder.getValue(PtrVal); SDValue LoadVal = Builder.DAG.getLoad(LoadVT, Builder.getCurSDLoc(), Root, Ptr, MachinePointerInfo(PtrVal), false /*volatile*/, false /*nontemporal*/, false /*isinvariant*/, 1 /* align=1 */); if (!ConstantMemory) Builder.PendingLoads.push_back(LoadVal.getValue(1)); return LoadVal; } /// processIntegerCallValue - Record the value for an instruction that /// produces an integer result, converting the type where necessary. void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I, SDValue Value, bool IsSigned) { EVT VT = DAG.getTargetLoweringInfo().getValueType(I.getType(), true); if (IsSigned) Value = DAG.getSExtOrTrunc(Value, getCurSDLoc(), VT); else Value = DAG.getZExtOrTrunc(Value, getCurSDLoc(), VT); setValue(&I, Value); } /// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form. /// If so, return true and lower it, otherwise return false and it will be /// lowered like a normal call. bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { // Verify that the prototype makes sense. int memcmp(void*,void*,size_t) if (I.getNumArgOperands() != 3) return false; const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1); if (!LHS->getType()->isPointerTy() || !RHS->getType()->isPointerTy() || !I.getArgOperand(2)->getType()->isIntegerTy() || !I.getType()->isIntegerTy()) return false; const Value *Size = I.getArgOperand(2); const ConstantInt *CSize = dyn_cast(Size); if (CSize && CSize->getZExtValue() == 0) { EVT CallVT = DAG.getTargetLoweringInfo().getValueType(I.getType(), true); setValue(&I, DAG.getConstant(0, CallVT)); return true; } const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo(); std::pair Res = TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS), getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS)); if (Res.first.getNode()) { processIntegerCallValue(I, Res.first, true); PendingLoads.push_back(Res.second); return true; } // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0 // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0 if (CSize && IsOnlyUsedInZeroEqualityComparison(&I)) { bool ActuallyDoIt = true; MVT LoadVT; Type *LoadTy; switch (CSize->getZExtValue()) { default: LoadVT = MVT::Other; LoadTy = nullptr; ActuallyDoIt = false; break; case 2: LoadVT = MVT::i16; LoadTy = Type::getInt16Ty(CSize->getContext()); break; case 4: LoadVT = MVT::i32; LoadTy = Type::getInt32Ty(CSize->getContext()); break; case 8: LoadVT = MVT::i64; LoadTy = Type::getInt64Ty(CSize->getContext()); break; /* case 16: LoadVT = MVT::v4i32; LoadTy = Type::getInt32Ty(CSize->getContext()); LoadTy = VectorType::get(LoadTy, 4); break; */ } // This turns into unaligned loads. We only do this if the target natively // supports the MVT we'll be loading or if it is small enough (<= 4) that // we'll only produce a small number of byte loads. // Require that we can find a legal MVT, and only do this if the target // supports unaligned loads of that type. Expanding into byte loads would // bloat the code. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (ActuallyDoIt && CSize->getZExtValue() > 4) { unsigned DstAS = LHS->getType()->getPointerAddressSpace(); unsigned SrcAS = RHS->getType()->getPointerAddressSpace(); // TODO: Handle 5 byte compare as 4-byte + 1 byte. // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads. // TODO: Check alignment of src and dest ptrs. if (!TLI.isTypeLegal(LoadVT) || !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) || !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS)) ActuallyDoIt = false; } if (ActuallyDoIt) { SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this); SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this); SDValue Res = DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal, ISD::SETNE); processIntegerCallValue(I, Res, false); return true; } } return false; } /// visitMemChrCall -- See if we can lower a memchr call into an optimized /// form. If so, return true and lower it, otherwise return false and it /// will be lowered like a normal call. bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) { // Verify that the prototype makes sense. void *memchr(void *, int, size_t) if (I.getNumArgOperands() != 3) return false; const Value *Src = I.getArgOperand(0); const Value *Char = I.getArgOperand(1); const Value *Length = I.getArgOperand(2); if (!Src->getType()->isPointerTy() || !Char->getType()->isIntegerTy() || !Length->getType()->isIntegerTy() || !I.getType()->isPointerTy()) return false; const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo(); std::pair Res = TSI.EmitTargetCodeForMemchr(DAG, getCurSDLoc(), DAG.getRoot(), getValue(Src), getValue(Char), getValue(Length), MachinePointerInfo(Src)); if (Res.first.getNode()) { setValue(&I, Res.first); PendingLoads.push_back(Res.second); return true; } return false; } /// visitStrCpyCall -- See if we can lower a strcpy or stpcpy call into an /// optimized form. If so, return true and lower it, otherwise return false /// and it will be lowered like a normal call. bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) { // Verify that the prototype makes sense. char *strcpy(char *, char *) if (I.getNumArgOperands() != 2) return false; const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); if (!Arg0->getType()->isPointerTy() || !Arg1->getType()->isPointerTy() || !I.getType()->isPointerTy()) return false; const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo(); std::pair Res = TSI.EmitTargetCodeForStrcpy(DAG, getCurSDLoc(), getRoot(), getValue(Arg0), getValue(Arg1), MachinePointerInfo(Arg0), MachinePointerInfo(Arg1), isStpcpy); if (Res.first.getNode()) { setValue(&I, Res.first); DAG.setRoot(Res.second); return true; } return false; } /// visitStrCmpCall - See if we can lower a call to strcmp in an optimized form. /// If so, return true and lower it, otherwise return false and it will be /// lowered like a normal call. bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) { // Verify that the prototype makes sense. int strcmp(void*,void*) if (I.getNumArgOperands() != 2) return false; const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); if (!Arg0->getType()->isPointerTy() || !Arg1->getType()->isPointerTy() || !I.getType()->isIntegerTy()) return false; const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo(); std::pair Res = TSI.EmitTargetCodeForStrcmp(DAG, getCurSDLoc(), DAG.getRoot(), getValue(Arg0), getValue(Arg1), MachinePointerInfo(Arg0), MachinePointerInfo(Arg1)); if (Res.first.getNode()) { processIntegerCallValue(I, Res.first, true); PendingLoads.push_back(Res.second); return true; } return false; } /// visitStrLenCall -- See if we can lower a strlen call into an optimized /// form. If so, return true and lower it, otherwise return false and it /// will be lowered like a normal call. bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) { // Verify that the prototype makes sense. size_t strlen(char *) if (I.getNumArgOperands() != 1) return false; const Value *Arg0 = I.getArgOperand(0); if (!Arg0->getType()->isPointerTy() || !I.getType()->isIntegerTy()) return false; const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo(); std::pair Res = TSI.EmitTargetCodeForStrlen(DAG, getCurSDLoc(), DAG.getRoot(), getValue(Arg0), MachinePointerInfo(Arg0)); if (Res.first.getNode()) { processIntegerCallValue(I, Res.first, false); PendingLoads.push_back(Res.second); return true; } return false; } /// visitStrNLenCall -- See if we can lower a strnlen call into an optimized /// form. If so, return true and lower it, otherwise return false and it /// will be lowered like a normal call. bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) { // Verify that the prototype makes sense. size_t strnlen(char *, size_t) if (I.getNumArgOperands() != 2) return false; const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); if (!Arg0->getType()->isPointerTy() || !Arg1->getType()->isIntegerTy() || !I.getType()->isIntegerTy()) return false; const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo(); std::pair Res = TSI.EmitTargetCodeForStrnlen(DAG, getCurSDLoc(), DAG.getRoot(), getValue(Arg0), getValue(Arg1), MachinePointerInfo(Arg0)); if (Res.first.getNode()) { processIntegerCallValue(I, Res.first, false); PendingLoads.push_back(Res.second); return true; } return false; } /// visitUnaryFloatCall - If a call instruction is a unary floating-point /// operation (as expected), translate it to an SDNode with the specified opcode /// and return true. bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, unsigned Opcode) { // Sanity check that it really is a unary floating-point call. if (I.getNumArgOperands() != 1 || !I.getArgOperand(0)->getType()->isFloatingPointTy() || I.getType() != I.getArgOperand(0)->getType() || !I.onlyReadsMemory()) return false; SDValue Tmp = getValue(I.getArgOperand(0)); setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), Tmp.getValueType(), Tmp)); return true; } /// visitBinaryFloatCall - If a call instruction is a binary floating-point /// operation (as expected), translate it to an SDNode with the specified opcode /// and return true. bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I, unsigned Opcode) { // Sanity check that it really is a binary floating-point call. if (I.getNumArgOperands() != 2 || !I.getArgOperand(0)->getType()->isFloatingPointTy() || I.getType() != I.getArgOperand(0)->getType() || I.getType() != I.getArgOperand(1)->getType() || !I.onlyReadsMemory()) return false; SDValue Tmp0 = getValue(I.getArgOperand(0)); SDValue Tmp1 = getValue(I.getArgOperand(1)); EVT VT = Tmp0.getValueType(); setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), VT, Tmp0, Tmp1)); return true; } void SelectionDAGBuilder::visitCall(const CallInst &I) { // Handle inline assembly differently. if (isa(I.getCalledValue())) { visitInlineAsm(&I); return; } MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); ComputeUsesVAFloatArgument(I, &MMI); const char *RenameFn = nullptr; if (Function *F = I.getCalledFunction()) { if (F->isDeclaration()) { if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo()) { if (unsigned IID = II->getIntrinsicID(F)) { RenameFn = visitIntrinsicCall(I, IID); if (!RenameFn) return; } } if (unsigned IID = F->getIntrinsicID()) { RenameFn = visitIntrinsicCall(I, IID); if (!RenameFn) return; } } // Check for well-known libc/libm calls. If the function is internal, it // can't be a library call. LibFunc::Func Func; if (!F->hasLocalLinkage() && F->hasName() && LibInfo->getLibFunc(F->getName(), Func) && LibInfo->hasOptimizedCodeGen(Func)) { switch (Func) { default: break; case LibFunc::copysign: case LibFunc::copysignf: case LibFunc::copysignl: if (I.getNumArgOperands() == 2 && // Basic sanity checks. I.getArgOperand(0)->getType()->isFloatingPointTy() && I.getType() == I.getArgOperand(0)->getType() && I.getType() == I.getArgOperand(1)->getType() && I.onlyReadsMemory()) { SDValue LHS = getValue(I.getArgOperand(0)); SDValue RHS = getValue(I.getArgOperand(1)); setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(), LHS.getValueType(), LHS, RHS)); return; } break; case LibFunc::fabs: case LibFunc::fabsf: case LibFunc::fabsl: if (visitUnaryFloatCall(I, ISD::FABS)) return; break; case LibFunc::fmin: case LibFunc::fminf: case LibFunc::fminl: if (visitBinaryFloatCall(I, ISD::FMINNUM)) return; break; case LibFunc::fmax: case LibFunc::fmaxf: case LibFunc::fmaxl: if (visitBinaryFloatCall(I, ISD::FMAXNUM)) return; break; case LibFunc::sin: case LibFunc::sinf: case LibFunc::sinl: if (visitUnaryFloatCall(I, ISD::FSIN)) return; break; case LibFunc::cos: case LibFunc::cosf: case LibFunc::cosl: if (visitUnaryFloatCall(I, ISD::FCOS)) return; break; case LibFunc::sqrt: case LibFunc::sqrtf: case LibFunc::sqrtl: case LibFunc::sqrt_finite: case LibFunc::sqrtf_finite: case LibFunc::sqrtl_finite: if (visitUnaryFloatCall(I, ISD::FSQRT)) return; break; case LibFunc::floor: case LibFunc::floorf: case LibFunc::floorl: if (visitUnaryFloatCall(I, ISD::FFLOOR)) return; break; case LibFunc::nearbyint: case LibFunc::nearbyintf: case LibFunc::nearbyintl: if (visitUnaryFloatCall(I, ISD::FNEARBYINT)) return; break; case LibFunc::ceil: case LibFunc::ceilf: case LibFunc::ceill: if (visitUnaryFloatCall(I, ISD::FCEIL)) return; break; case LibFunc::rint: case LibFunc::rintf: case LibFunc::rintl: if (visitUnaryFloatCall(I, ISD::FRINT)) return; break; case LibFunc::round: case LibFunc::roundf: case LibFunc::roundl: if (visitUnaryFloatCall(I, ISD::FROUND)) return; break; case LibFunc::trunc: case LibFunc::truncf: case LibFunc::truncl: if (visitUnaryFloatCall(I, ISD::FTRUNC)) return; break; case LibFunc::log2: case LibFunc::log2f: case LibFunc::log2l: if (visitUnaryFloatCall(I, ISD::FLOG2)) return; break; case LibFunc::exp2: case LibFunc::exp2f: case LibFunc::exp2l: if (visitUnaryFloatCall(I, ISD::FEXP2)) return; break; case LibFunc::memcmp: if (visitMemCmpCall(I)) return; break; case LibFunc::memchr: if (visitMemChrCall(I)) return; break; case LibFunc::strcpy: if (visitStrCpyCall(I, false)) return; break; case LibFunc::stpcpy: if (visitStrCpyCall(I, true)) return; break; case LibFunc::strcmp: if (visitStrCmpCall(I)) return; break; case LibFunc::strlen: if (visitStrLenCall(I)) return; break; case LibFunc::strnlen: if (visitStrNLenCall(I)) return; break; } } } SDValue Callee; if (!RenameFn) Callee = getValue(I.getCalledValue()); else Callee = DAG.getExternalSymbol(RenameFn, DAG.getTargetLoweringInfo().getPointerTy()); // Check if we can potentially perform a tail call. More detailed checking is // be done within LowerCallTo, after more information about the call is known. LowerCallTo(&I, Callee, I.isTailCall()); } namespace { /// AsmOperandInfo - This contains information for each constraint that we are /// lowering. class SDISelAsmOperandInfo : public TargetLowering::AsmOperandInfo { public: /// CallOperand - If this is the result output operand or a clobber /// this is null, otherwise it is the incoming operand to the CallInst. /// This gets modified as the asm is processed. SDValue CallOperand; /// AssignedRegs - If this is a register or register class operand, this /// contains the set of register corresponding to the operand. RegsForValue AssignedRegs; explicit SDISelAsmOperandInfo(const TargetLowering::AsmOperandInfo &info) : TargetLowering::AsmOperandInfo(info), CallOperand(nullptr,0) { } /// getCallOperandValEVT - Return the EVT of the Value* that this operand /// corresponds to. If there is no Value* for this operand, it returns /// MVT::Other. EVT getCallOperandValEVT(LLVMContext &Context, const TargetLowering &TLI, const DataLayout *DL) const { if (!CallOperandVal) return MVT::Other; if (isa(CallOperandVal)) return TLI.getPointerTy(); llvm::Type *OpTy = CallOperandVal->getType(); // FIXME: code duplicated from TargetLowering::ParseConstraints(). // If this is an indirect operand, the operand is a pointer to the // accessed type. if (isIndirect) { llvm::PointerType *PtrTy = dyn_cast(OpTy); if (!PtrTy) report_fatal_error("Indirect operand for inline asm not a pointer!"); OpTy = PtrTy->getElementType(); } // Look for vector wrapped in a struct. e.g. { <16 x i8> }. if (StructType *STy = dyn_cast(OpTy)) if (STy->getNumElements() == 1) OpTy = STy->getElementType(0); // If OpTy is not a single value, it may be a struct/union that we // can tile with integers. if (!OpTy->isSingleValueType() && OpTy->isSized()) { unsigned BitSize = DL->getTypeSizeInBits(OpTy); switch (BitSize) { default: break; case 1: case 8: case 16: case 32: case 64: case 128: OpTy = IntegerType::get(Context, BitSize); break; } } return TLI.getValueType(OpTy, true); } }; typedef SmallVector SDISelAsmOperandInfoVector; } // end anonymous namespace /// GetRegistersForValue - Assign registers (virtual or physical) for the /// specified operand. We prefer to assign virtual registers, to allow the /// register allocator to handle the assignment process. However, if the asm /// uses features that we can't model on machineinstrs, we have SDISel do the /// allocation. This produces generally horrible, but correct, code. /// /// OpInfo describes the operand. /// static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI, SDLoc DL, SDISelAsmOperandInfo &OpInfo) { LLVMContext &Context = *DAG.getContext(); MachineFunction &MF = DAG.getMachineFunction(); SmallVector Regs; // If this is a constraint for a single physreg, or a constraint for a // register class, find it. std::pair PhysReg = TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode, OpInfo.ConstraintVT); unsigned NumRegs = 1; if (OpInfo.ConstraintVT != MVT::Other) { // If this is a FP input in an integer register (or visa versa) insert a bit // cast of the input value. More generally, handle any case where the input // value disagrees with the register class we plan to stick this in. if (OpInfo.Type == InlineAsm::isInput && PhysReg.second && !PhysReg.second->hasType(OpInfo.ConstraintVT)) { // Try to convert to the first EVT that the reg class contains. If the // types are identical size, use a bitcast to convert (e.g. two differing // vector types). MVT RegVT = *PhysReg.second->vt_begin(); if (RegVT.getSizeInBits() == OpInfo.CallOperand.getValueSizeInBits()) { OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL, RegVT, OpInfo.CallOperand); OpInfo.ConstraintVT = RegVT; } else if (RegVT.isInteger() && OpInfo.ConstraintVT.isFloatingPoint()) { // If the input is a FP value and we want it in FP registers, do a // bitcast to the corresponding integer type. This turns an f64 value // into i64, which can be passed with two i32 values on a 32-bit // machine. RegVT = MVT::getIntegerVT(OpInfo.ConstraintVT.getSizeInBits()); OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL, RegVT, OpInfo.CallOperand); OpInfo.ConstraintVT = RegVT; } } NumRegs = TLI.getNumRegisters(Context, OpInfo.ConstraintVT); } MVT RegVT; EVT ValueVT = OpInfo.ConstraintVT; // If this is a constraint for a specific physical register, like {r17}, // assign it now. if (unsigned AssignedReg = PhysReg.first) { const TargetRegisterClass *RC = PhysReg.second; if (OpInfo.ConstraintVT == MVT::Other) ValueVT = *RC->vt_begin(); // Get the actual register value type. This is important, because the user // may have asked for (e.g.) the AX register in i32 type. We need to // remember that AX is actually i16 to get the right extension. RegVT = *RC->vt_begin(); // This is a explicit reference to a physical register. Regs.push_back(AssignedReg); // If this is an expanded reference, add the rest of the regs to Regs. if (NumRegs != 1) { TargetRegisterClass::iterator I = RC->begin(); for (; *I != AssignedReg; ++I) assert(I != RC->end() && "Didn't find reg!"); // Already added the first reg. --NumRegs; ++I; for (; NumRegs; --NumRegs, ++I) { assert(I != RC->end() && "Ran out of registers to allocate!"); Regs.push_back(*I); } } OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT); return; } // Otherwise, if this was a reference to an LLVM register class, create vregs // for this reference. if (const TargetRegisterClass *RC = PhysReg.second) { RegVT = *RC->vt_begin(); if (OpInfo.ConstraintVT == MVT::Other) ValueVT = RegVT; // Create the appropriate number of virtual registers. MachineRegisterInfo &RegInfo = MF.getRegInfo(); for (; NumRegs; --NumRegs) Regs.push_back(RegInfo.createVirtualRegister(RC)); OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT); return; } // Otherwise, we couldn't allocate enough registers for this. } /// visitInlineAsm - Handle a call to an InlineAsm object. /// void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { const InlineAsm *IA = cast(CS.getCalledValue()); /// ConstraintOperands - Information about all of the constraints. SDISelAsmOperandInfoVector ConstraintOperands; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(CS); bool hasMemory = false; unsigned ArgNo = 0; // ArgNo - The argument of the CallInst. unsigned ResNo = 0; // ResNo - The result number of the next output. for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { ConstraintOperands.push_back(SDISelAsmOperandInfo(TargetConstraints[i])); SDISelAsmOperandInfo &OpInfo = ConstraintOperands.back(); MVT OpVT = MVT::Other; // Compute the value type for each operand. switch (OpInfo.Type) { case InlineAsm::isOutput: // Indirect outputs just consume an argument. if (OpInfo.isIndirect) { OpInfo.CallOperandVal = const_cast(CS.getArgument(ArgNo++)); break; } // The return value of the call is this value. As such, there is no // corresponding argument. assert(!CS.getType()->isVoidTy() && "Bad inline asm!"); if (StructType *STy = dyn_cast(CS.getType())) { OpVT = TLI.getSimpleValueType(STy->getElementType(ResNo)); } else { assert(ResNo == 0 && "Asm only has one result!"); OpVT = TLI.getSimpleValueType(CS.getType()); } ++ResNo; break; case InlineAsm::isInput: OpInfo.CallOperandVal = const_cast(CS.getArgument(ArgNo++)); break; case InlineAsm::isClobber: // Nothing to do. break; } // If this is an input or an indirect output, process the call argument. // BasicBlocks are labels, currently appearing only in asm's. if (OpInfo.CallOperandVal) { if (const BasicBlock *BB = dyn_cast(OpInfo.CallOperandVal)) { OpInfo.CallOperand = DAG.getBasicBlock(FuncInfo.MBBMap[BB]); } else { OpInfo.CallOperand = getValue(OpInfo.CallOperandVal); } OpVT = OpInfo.getCallOperandValEVT(*DAG.getContext(), TLI, DL).getSimpleVT(); } OpInfo.ConstraintVT = OpVT; // Indirect operand accesses access memory. if (OpInfo.isIndirect) hasMemory = true; else { for (unsigned j = 0, ee = OpInfo.Codes.size(); j != ee; ++j) { TargetLowering::ConstraintType CType = TLI.getConstraintType(OpInfo.Codes[j]); if (CType == TargetLowering::C_Memory) { hasMemory = true; break; } } } } SDValue Chain, Flag; // We won't need to flush pending loads if this asm doesn't touch // memory and is nonvolatile. if (hasMemory || IA->hasSideEffects()) Chain = getRoot(); else Chain = DAG.getRoot(); // Second pass over the constraints: compute which constraint option to use // and assign registers to constraints that want a specific physreg. for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) { SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i]; // If this is an output operand with a matching input operand, look up the // matching input. If their types mismatch, e.g. one is an integer, the // other is floating point, or their sizes are different, flag it as an // error. if (OpInfo.hasMatchingInput()) { SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput]; if (OpInfo.ConstraintVT != Input.ConstraintVT) { std::pair MatchRC = TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode, OpInfo.ConstraintVT); std::pair InputRC = TLI.getRegForInlineAsmConstraint(Input.ConstraintCode, Input.ConstraintVT); if ((OpInfo.ConstraintVT.isInteger() != Input.ConstraintVT.isInteger()) || (MatchRC.second != InputRC.second)) { report_fatal_error("Unsupported asm: input constraint" " with a matching output constraint of" " incompatible type!"); } Input.ConstraintVT = OpInfo.ConstraintVT; } } // Compute the constraint code and ConstraintType to use. TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG); if (OpInfo.ConstraintType == TargetLowering::C_Memory && OpInfo.Type == InlineAsm::isClobber) continue; // If this is a memory input, and if the operand is not indirect, do what we // need to to provide an address for the memory input. if (OpInfo.ConstraintType == TargetLowering::C_Memory && !OpInfo.isIndirect) { assert((OpInfo.isMultipleAlternative || (OpInfo.Type == InlineAsm::isInput)) && "Can only indirectify direct input operands!"); // Memory operands really want the address of the value. If we don't have // an indirect input, put it in the constpool if we can, otherwise spill // it to a stack slot. // TODO: This isn't quite right. We need to handle these according to // the addressing mode that the constraint wants. Also, this may take // an additional register for the computation and we don't want that // either. // If the operand is a float, integer, or vector constant, spill to a // constant pool entry to get its address. const Value *OpVal = OpInfo.CallOperandVal; if (isa(OpVal) || isa(OpVal) || isa(OpVal) || isa(OpVal)) { OpInfo.CallOperand = DAG.getConstantPool(cast(OpVal), TLI.getPointerTy()); } else { // Otherwise, create a stack slot and emit a store to it before the // asm. Type *Ty = OpVal->getType(); uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty); unsigned Align = TLI.getDataLayout()->getPrefTypeAlignment(Ty); MachineFunction &MF = DAG.getMachineFunction(); int SSFI = MF.getFrameInfo()->CreateStackObject(TySize, Align, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getPointerTy()); Chain = DAG.getStore(Chain, getCurSDLoc(), OpInfo.CallOperand, StackSlot, MachinePointerInfo::getFixedStack(SSFI), false, false, 0); OpInfo.CallOperand = StackSlot; } // There is no longer a Value* corresponding to this operand. OpInfo.CallOperandVal = nullptr; // It is now an indirect operand. OpInfo.isIndirect = true; } // If this constraint is for a specific register, allocate it before // anything else. if (OpInfo.ConstraintType == TargetLowering::C_Register) GetRegistersForValue(DAG, TLI, getCurSDLoc(), OpInfo); } // Second pass - Loop over all of the operands, assigning virtual or physregs // to register class operands. for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) { SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i]; // C_Register operands have already been allocated, Other/Memory don't need // to be. if (OpInfo.ConstraintType == TargetLowering::C_RegisterClass) GetRegistersForValue(DAG, TLI, getCurSDLoc(), OpInfo); } // AsmNodeOperands - The operands for the ISD::INLINEASM node. std::vector AsmNodeOperands; AsmNodeOperands.push_back(SDValue()); // reserve space for input chain AsmNodeOperands.push_back( DAG.getTargetExternalSymbol(IA->getAsmString().c_str(), TLI.getPointerTy())); // If we have a !srcloc metadata node associated with it, we want to attach // this to the ultimately generated inline asm machineinstr. To do this, we // pass in the third operand as this (potentially null) inline asm MDNode. const MDNode *SrcLoc = CS.getInstruction()->getMetadata("srcloc"); AsmNodeOperands.push_back(DAG.getMDNode(SrcLoc)); // Remember the HasSideEffect, AlignStack, AsmDialect, MayLoad and MayStore // bits as operand 3. unsigned ExtraInfo = 0; if (IA->hasSideEffects()) ExtraInfo |= InlineAsm::Extra_HasSideEffects; if (IA->isAlignStack()) ExtraInfo |= InlineAsm::Extra_IsAlignStack; // Set the asm dialect. ExtraInfo |= IA->getDialect() * InlineAsm::Extra_AsmDialect; // Determine if this InlineAsm MayLoad or MayStore based on the constraints. for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; // Compute the constraint code and ConstraintType to use. TLI.ComputeConstraintToUse(OpInfo, SDValue()); // Ideally, we would only check against memory constraints. However, the // meaning of an other constraint can be target-specific and we can't easily // reason about it. Therefore, be conservative and set MayLoad/MayStore // for other constriants as well. if (OpInfo.ConstraintType == TargetLowering::C_Memory || OpInfo.ConstraintType == TargetLowering::C_Other) { if (OpInfo.Type == InlineAsm::isInput) ExtraInfo |= InlineAsm::Extra_MayLoad; else if (OpInfo.Type == InlineAsm::isOutput) ExtraInfo |= InlineAsm::Extra_MayStore; else if (OpInfo.Type == InlineAsm::isClobber) ExtraInfo |= (InlineAsm::Extra_MayLoad | InlineAsm::Extra_MayStore); } } AsmNodeOperands.push_back(DAG.getTargetConstant(ExtraInfo, TLI.getPointerTy())); // Loop over all of the inputs, copying the operand values into the // appropriate registers and processing the output regs. RegsForValue RetValRegs; // IndirectStoresToEmit - The set of stores to emit after the inline asm node. std::vector > IndirectStoresToEmit; for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) { SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i]; switch (OpInfo.Type) { case InlineAsm::isOutput: { if (OpInfo.ConstraintType != TargetLowering::C_RegisterClass && OpInfo.ConstraintType != TargetLowering::C_Register) { // Memory output, or 'other' output (e.g. 'X' constraint). assert(OpInfo.isIndirect && "Memory output must be indirect operand"); // Add information to the INLINEASM node to know about this output. unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1); AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlags, TLI.getPointerTy())); AsmNodeOperands.push_back(OpInfo.CallOperand); break; } // Otherwise, this is a register or register class output. // Copy the output from the appropriate register. Find a register that // we can use. if (OpInfo.AssignedRegs.Regs.empty()) { LLVMContext &Ctx = *DAG.getContext(); Ctx.emitError(CS.getInstruction(), "couldn't allocate output register for constraint '" + Twine(OpInfo.ConstraintCode) + "'"); return; } // If this is an indirect operand, store through the pointer after the // asm. if (OpInfo.isIndirect) { IndirectStoresToEmit.push_back(std::make_pair(OpInfo.AssignedRegs, OpInfo.CallOperandVal)); } else { // This is the result value of the call. assert(!CS.getType()->isVoidTy() && "Bad inline asm!"); // Concatenate this output onto the outputs list. RetValRegs.append(OpInfo.AssignedRegs); } // Add information to the INLINEASM node to know that this register is // set. OpInfo.AssignedRegs .AddInlineAsmOperands(OpInfo.isEarlyClobber ? InlineAsm::Kind_RegDefEarlyClobber : InlineAsm::Kind_RegDef, false, 0, DAG, AsmNodeOperands); break; } case InlineAsm::isInput: { SDValue InOperandVal = OpInfo.CallOperand; if (OpInfo.isMatchingInputConstraint()) { // Matching constraint? // If this is required to match an output register we have already set, // just use its register. unsigned OperandNo = OpInfo.getMatchedOperand(); // Scan until we find the definition we already emitted of this operand. // When we find it, create a RegsForValue operand. unsigned CurOp = InlineAsm::Op_FirstOperand; for (; OperandNo; --OperandNo) { // Advance to the next operand. unsigned OpFlag = cast(AsmNodeOperands[CurOp])->getZExtValue(); assert((InlineAsm::isRegDefKind(OpFlag) || InlineAsm::isRegDefEarlyClobberKind(OpFlag) || InlineAsm::isMemKind(OpFlag)) && "Skipped past definitions?"); CurOp += InlineAsm::getNumOperandRegisters(OpFlag)+1; } unsigned OpFlag = cast(AsmNodeOperands[CurOp])->getZExtValue(); if (InlineAsm::isRegDefKind(OpFlag) || InlineAsm::isRegDefEarlyClobberKind(OpFlag)) { // Add (OpFlag&0xffff)>>3 registers to MatchedRegs. if (OpInfo.isIndirect) { // This happens on gcc/testsuite/gcc.dg/pr8788-1.c LLVMContext &Ctx = *DAG.getContext(); Ctx.emitError(CS.getInstruction(), "inline asm not supported yet:" " don't know how to handle tied " "indirect register inputs"); return; } RegsForValue MatchedRegs; MatchedRegs.ValueVTs.push_back(InOperandVal.getValueType()); MVT RegVT = AsmNodeOperands[CurOp+1].getSimpleValueType(); MatchedRegs.RegVTs.push_back(RegVT); MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo(); for (unsigned i = 0, e = InlineAsm::getNumOperandRegisters(OpFlag); i != e; ++i) { if (const TargetRegisterClass *RC = TLI.getRegClassFor(RegVT)) MatchedRegs.Regs.push_back(RegInfo.createVirtualRegister(RC)); else { LLVMContext &Ctx = *DAG.getContext(); Ctx.emitError(CS.getInstruction(), "inline asm error: This value" " type register class is not natively supported!"); return; } } // Use the produced MatchedRegs object to MatchedRegs.getCopyToRegs(InOperandVal, DAG, getCurSDLoc(), Chain, &Flag, CS.getInstruction()); MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, true, OpInfo.getMatchedOperand(), DAG, AsmNodeOperands); break; } assert(InlineAsm::isMemKind(OpFlag) && "Unknown matching constraint!"); assert(InlineAsm::getNumOperandRegisters(OpFlag) == 1 && "Unexpected number of operands"); // Add information to the INLINEASM node to know about this input. // See InlineAsm.h isUseOperandTiedToDef. OpFlag = InlineAsm::getFlagWordForMatchingOp(OpFlag, OpInfo.getMatchedOperand()); AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlag, TLI.getPointerTy())); AsmNodeOperands.push_back(AsmNodeOperands[CurOp+1]); break; } // Treat indirect 'X' constraint as memory. if (OpInfo.ConstraintType == TargetLowering::C_Other && OpInfo.isIndirect) OpInfo.ConstraintType = TargetLowering::C_Memory; if (OpInfo.ConstraintType == TargetLowering::C_Other) { std::vector Ops; TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode, Ops, DAG); if (Ops.empty()) { LLVMContext &Ctx = *DAG.getContext(); Ctx.emitError(CS.getInstruction(), "invalid operand for inline asm constraint '" + Twine(OpInfo.ConstraintCode) + "'"); return; } // Add information to the INLINEASM node to know about this input. unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Imm, Ops.size()); AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType, TLI.getPointerTy())); AsmNodeOperands.insert(AsmNodeOperands.end(), Ops.begin(), Ops.end()); break; } if (OpInfo.ConstraintType == TargetLowering::C_Memory) { assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!"); assert(InOperandVal.getValueType() == TLI.getPointerTy() && "Memory operands expect pointer values"); // Add information to the INLINEASM node to know about this input. unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1); AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType, TLI.getPointerTy())); AsmNodeOperands.push_back(InOperandVal); break; } assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass || OpInfo.ConstraintType == TargetLowering::C_Register) && "Unknown constraint type!"); // TODO: Support this. if (OpInfo.isIndirect) { LLVMContext &Ctx = *DAG.getContext(); Ctx.emitError(CS.getInstruction(), "Don't know how to handle indirect register inputs yet " "for constraint '" + Twine(OpInfo.ConstraintCode) + "'"); return; } // Copy the input into the appropriate registers. if (OpInfo.AssignedRegs.Regs.empty()) { LLVMContext &Ctx = *DAG.getContext(); Ctx.emitError(CS.getInstruction(), "couldn't allocate input reg for constraint '" + Twine(OpInfo.ConstraintCode) + "'"); return; } OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, getCurSDLoc(), Chain, &Flag, CS.getInstruction()); OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, false, 0, DAG, AsmNodeOperands); break; } case InlineAsm::isClobber: { // Add the clobbered value to the operand list, so that the register // allocator is aware that the physreg got clobbered. if (!OpInfo.AssignedRegs.Regs.empty()) OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_Clobber, false, 0, DAG, AsmNodeOperands); break; } } } // Finish up input operands. Set the input chain and add the flag last. AsmNodeOperands[InlineAsm::Op_InputChain] = Chain; if (Flag.getNode()) AsmNodeOperands.push_back(Flag); Chain = DAG.getNode(ISD::INLINEASM, getCurSDLoc(), DAG.getVTList(MVT::Other, MVT::Glue), AsmNodeOperands); Flag = Chain.getValue(1); // If this asm returns a register value, copy the result from that register // and set it as the value of the call. if (!RetValRegs.Regs.empty()) { SDValue Val = RetValRegs.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, &Flag, CS.getInstruction()); // FIXME: Why don't we do this for inline asms with MRVs? if (CS.getType()->isSingleValueType() && CS.getType()->isSized()) { EVT ResultType = TLI.getValueType(CS.getType()); // If any of the results of the inline asm is a vector, it may have the // wrong width/num elts. This can happen for register classes that can // contain multiple different value types. The preg or vreg allocated may // not have the same VT as was expected. Convert it to the right type // with bit_convert. if (ResultType != Val.getValueType() && Val.getValueType().isVector()) { Val = DAG.getNode(ISD::BITCAST, getCurSDLoc(), ResultType, Val); } else if (ResultType != Val.getValueType() && ResultType.isInteger() && Val.getValueType().isInteger()) { // If a result value was tied to an input value, the computed result may // have a wider width than the expected result. Extract the relevant // portion. Val = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), ResultType, Val); } assert(ResultType == Val.getValueType() && "Asm result value mismatch!"); } setValue(CS.getInstruction(), Val); // Don't need to use this as a chain in this case. if (!IA->hasSideEffects() && !hasMemory && IndirectStoresToEmit.empty()) return; } std::vector > StoresToEmit; // Process indirect outputs, first output all of the flagged copies out of // physregs. for (unsigned i = 0, e = IndirectStoresToEmit.size(); i != e; ++i) { RegsForValue &OutRegs = IndirectStoresToEmit[i].first; const Value *Ptr = IndirectStoresToEmit[i].second; SDValue OutVal = OutRegs.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, &Flag, IA); StoresToEmit.push_back(std::make_pair(OutVal, Ptr)); } // Emit the non-flagged stores from the physregs. SmallVector OutChains; for (unsigned i = 0, e = StoresToEmit.size(); i != e; ++i) { SDValue Val = DAG.getStore(Chain, getCurSDLoc(), StoresToEmit[i].first, getValue(StoresToEmit[i].second), MachinePointerInfo(StoresToEmit[i].second), false, false, 0); OutChains.push_back(Val); } if (!OutChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, OutChains); DAG.setRoot(Chain); } void SelectionDAGBuilder::visitVAStart(const CallInst &I) { DAG.setRoot(DAG.getNode(ISD::VASTART, getCurSDLoc(), MVT::Other, getRoot(), getValue(I.getArgOperand(0)), DAG.getSrcValue(I.getArgOperand(0)))); } void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const DataLayout &DL = *TLI.getDataLayout(); SDValue V = DAG.getVAArg(TLI.getValueType(I.getType()), getCurSDLoc(), getRoot(), getValue(I.getOperand(0)), DAG.getSrcValue(I.getOperand(0)), DL.getABITypeAlignment(I.getType())); setValue(&I, V); DAG.setRoot(V.getValue(1)); } void SelectionDAGBuilder::visitVAEnd(const CallInst &I) { DAG.setRoot(DAG.getNode(ISD::VAEND, getCurSDLoc(), MVT::Other, getRoot(), getValue(I.getArgOperand(0)), DAG.getSrcValue(I.getArgOperand(0)))); } void SelectionDAGBuilder::visitVACopy(const CallInst &I) { DAG.setRoot(DAG.getNode(ISD::VACOPY, getCurSDLoc(), MVT::Other, getRoot(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG.getSrcValue(I.getArgOperand(0)), DAG.getSrcValue(I.getArgOperand(1)))); } /// \brief Lower an argument list according to the target calling convention. /// /// \return A tuple of /// /// This is a helper for lowering intrinsics that follow a target calling /// convention or require stack pointer adjustment. Only a subset of the /// intrinsic's operands need to participate in the calling convention. std::pair SelectionDAGBuilder::lowerCallOperands(ImmutableCallSite CS, unsigned ArgIdx, unsigned NumArgs, SDValue Callee, bool UseVoidTy, MachineBasicBlock *LandingPad, bool IsPatchPoint) { TargetLowering::ArgListTy Args; Args.reserve(NumArgs); // Populate the argument list. // Attributes for args start at offset 1, after the return attribute. for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1; ArgI != ArgE; ++ArgI) { const Value *V = CS->getOperand(ArgI); assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); TargetLowering::ArgListEntry Entry; Entry.Node = getValue(V); Entry.Ty = V->getType(); Entry.setAttributes(&CS, AttrI); Args.push_back(Entry); } Type *retTy = UseVoidTy ? Type::getVoidTy(*DAG.getContext()) : CS->getType(); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot()) .setCallee(CS.getCallingConv(), retTy, Callee, std::move(Args), NumArgs) .setDiscardResult(CS->use_empty()).setIsPatchPoint(IsPatchPoint); return lowerInvokable(CLI, LandingPad); } /// \brief Add a stack map intrinsic call's live variable operands to a stackmap /// or patchpoint target node's operand list. /// /// Constants are converted to TargetConstants purely as an optimization to /// avoid constant materialization and register allocation. /// /// FrameIndex operands are converted to TargetFrameIndex so that ISEL does not /// generate addess computation nodes, and so ExpandISelPseudo can convert the /// TargetFrameIndex into a DirectMemRefOp StackMap location. This avoids /// address materialization and register allocation, but may also be required /// for correctness. If a StackMap (or PatchPoint) intrinsic directly uses an /// alloca in the entry block, then the runtime may assume that the alloca's /// StackMap location can be read immediately after compilation and that the /// location is valid at any point during execution (this is similar to the /// assumption made by the llvm.gcroot intrinsic). If the alloca's location were /// only available in a register, then the runtime would need to trap when /// execution reaches the StackMap in order to read the alloca's location. static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx, SmallVectorImpl &Ops, SelectionDAGBuilder &Builder) { for (unsigned i = StartIdx, e = CS.arg_size(); i != e; ++i) { SDValue OpVal = Builder.getValue(CS.getArgument(i)); if (ConstantSDNode *C = dyn_cast(OpVal)) { Ops.push_back( Builder.DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64)); Ops.push_back( Builder.DAG.getTargetConstant(C->getSExtValue(), MVT::i64)); } else if (FrameIndexSDNode *FI = dyn_cast(OpVal)) { const TargetLowering &TLI = Builder.DAG.getTargetLoweringInfo(); Ops.push_back( Builder.DAG.getTargetFrameIndex(FI->getIndex(), TLI.getPointerTy())); } else Ops.push_back(OpVal); } } /// \brief Lower llvm.experimental.stackmap directly to its target opcode. void SelectionDAGBuilder::visitStackmap(const CallInst &CI) { // void @llvm.experimental.stackmap(i32 , i32 , // [live variables...]) assert(CI.getType()->isVoidTy() && "Stackmap cannot return a value."); SDValue Chain, InFlag, Callee, NullPtr; SmallVector Ops; SDLoc DL = getCurSDLoc(); Callee = getValue(CI.getCalledValue()); NullPtr = DAG.getIntPtrConstant(0, true); // The stackmap intrinsic only records the live variables (the arguemnts // passed to it) and emits NOPS (if requested). Unlike the patchpoint // intrinsic, this won't be lowered to a function call. This means we don't // have to worry about calling conventions and target specific lowering code. // Instead we perform the call lowering right here. // // chain, flag = CALLSEQ_START(chain, 0) // chain, flag = STACKMAP(id, nbytes, ..., chain, flag) // chain, flag = CALLSEQ_END(chain, 0, 0, flag) // Chain = DAG.getCALLSEQ_START(getRoot(), NullPtr, DL); InFlag = Chain.getValue(1); // Add the and constants. SDValue IDVal = getValue(CI.getOperand(PatchPointOpers::IDPos)); Ops.push_back(DAG.getTargetConstant( cast(IDVal)->getZExtValue(), MVT::i64)); SDValue NBytesVal = getValue(CI.getOperand(PatchPointOpers::NBytesPos)); Ops.push_back(DAG.getTargetConstant( cast(NBytesVal)->getZExtValue(), MVT::i32)); // Push live variables for the stack map. addStackMapLiveVars(&CI, 2, Ops, *this); // We are not pushing any register mask info here on the operands list, // because the stackmap doesn't clobber anything. // Push the chain and the glue flag. Ops.push_back(Chain); Ops.push_back(InFlag); // Create the STACKMAP node. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDNode *SM = DAG.getMachineNode(TargetOpcode::STACKMAP, DL, NodeTys, Ops); Chain = SDValue(SM, 0); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, NullPtr, NullPtr, InFlag, DL); // Stackmaps don't generate values, so nothing goes into the NodeMap. // Set the root to the target-lowered call chain. DAG.setRoot(Chain); // Inform the Frame Information that we have a stackmap in this function. FuncInfo.MF->getFrameInfo()->setHasStackMap(); } /// \brief Lower llvm.experimental.patchpoint directly to its target opcode. void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS, MachineBasicBlock *LandingPad) { // void|i64 @llvm.experimental.patchpoint.void|i64(i64 , // i32 , // i8* , // i32 , // [Args...], // [live variables...]) CallingConv::ID CC = CS.getCallingConv(); bool IsAnyRegCC = CC == CallingConv::AnyReg; bool HasDef = !CS->getType()->isVoidTy(); SDValue Callee = getValue(CS->getOperand(2)); // // Get the real number of arguments participating in the call SDValue NArgVal = getValue(CS.getArgument(PatchPointOpers::NArgPos)); unsigned NumArgs = cast(NArgVal)->getZExtValue(); // Skip the four meta args: , , , // Intrinsics include all meta-operands up to but not including CC. unsigned NumMetaOpers = PatchPointOpers::CCPos; assert(CS.arg_size() >= NumMetaOpers + NumArgs && "Not enough arguments provided to the patchpoint intrinsic"); // For AnyRegCC the arguments are lowered later on manually. unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs; std::pair Result = lowerCallOperands(CS, NumMetaOpers, NumCallArgs, Callee, IsAnyRegCC, LandingPad, true); SDNode *CallEnd = Result.second.getNode(); if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg)) CallEnd = CallEnd->getOperand(0).getNode(); /// Get a call instruction from the call sequence chain. /// Tail calls are not allowed. assert(CallEnd->getOpcode() == ISD::CALLSEQ_END && "Expected a callseq node."); SDNode *Call = CallEnd->getOperand(0).getNode(); bool HasGlue = Call->getGluedNode(); // Replace the target specific call node with the patchable intrinsic. SmallVector Ops; // Add the and constants. SDValue IDVal = getValue(CS->getOperand(PatchPointOpers::IDPos)); Ops.push_back(DAG.getTargetConstant( cast(IDVal)->getZExtValue(), MVT::i64)); SDValue NBytesVal = getValue(CS->getOperand(PatchPointOpers::NBytesPos)); Ops.push_back(DAG.getTargetConstant( cast(NBytesVal)->getZExtValue(), MVT::i32)); // Assume that the Callee is a constant address. // FIXME: handle function symbols in the future. Ops.push_back( DAG.getIntPtrConstant(cast(Callee)->getZExtValue(), /*isTarget=*/true)); // Adjust to account for any arguments that have been passed on the // stack instead. // Call Node: Chain, Target, {Args}, RegMask, [Glue] unsigned NumCallRegArgs = Call->getNumOperands() - (HasGlue ? 4 : 3); NumCallRegArgs = IsAnyRegCC ? NumArgs : NumCallRegArgs; Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, MVT::i32)); // Add the calling convention Ops.push_back(DAG.getTargetConstant((unsigned)CC, MVT::i32)); // Add the arguments we omitted previously. The register allocator should // place these in any free register. if (IsAnyRegCC) for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i) Ops.push_back(getValue(CS.getArgument(i))); // Push the arguments from the call instruction up to the register mask. SDNode::op_iterator e = HasGlue ? Call->op_end()-2 : Call->op_end()-1; for (SDNode::op_iterator i = Call->op_begin()+2; i != e; ++i) Ops.push_back(*i); // Push live variables for the stack map. addStackMapLiveVars(CS, NumMetaOpers + NumArgs, Ops, *this); // Push the register mask info. if (HasGlue) Ops.push_back(*(Call->op_end()-2)); else Ops.push_back(*(Call->op_end()-1)); // Push the chain (this is originally the first operand of the call, but // becomes now the last or second to last operand). Ops.push_back(*(Call->op_begin())); // Push the glue flag (last operand). if (HasGlue) Ops.push_back(*(Call->op_end()-1)); SDVTList NodeTys; if (IsAnyRegCC && HasDef) { // Create the return types based on the intrinsic definition const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SmallVector ValueVTs; ComputeValueVTs(TLI, CS->getType(), ValueVTs); assert(ValueVTs.size() == 1 && "Expected only one return value type."); // There is always a chain and a glue type at the end ValueVTs.push_back(MVT::Other); ValueVTs.push_back(MVT::Glue); NodeTys = DAG.getVTList(ValueVTs); } else NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); // Replace the target specific call node with a PATCHPOINT node. MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHPOINT, getCurSDLoc(), NodeTys, Ops); // Update the NodeMap. if (HasDef) { if (IsAnyRegCC) setValue(CS.getInstruction(), SDValue(MN, 0)); else setValue(CS.getInstruction(), Result.first); } // Fixup the consumers of the intrinsic. The chain and glue may be used in the // call sequence. Furthermore the location of the chain and glue can change // when the AnyReg calling convention is used and the intrinsic returns a // value. if (IsAnyRegCC && HasDef) { SDValue From[] = {SDValue(Call, 0), SDValue(Call, 1)}; SDValue To[] = {SDValue(MN, 1), SDValue(MN, 2)}; DAG.ReplaceAllUsesOfValuesWith(From, To, 2); } else DAG.ReplaceAllUsesWith(Call, MN); DAG.DeleteNode(Call); // Inform the Frame Information that we have a patchpoint in this function. FuncInfo.MF->getFrameInfo()->setHasPatchPoint(); } /// Returns an AttributeSet representing the attributes applied to the return /// value of the given call. static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { SmallVector Attrs; if (CLI.RetSExt) Attrs.push_back(Attribute::SExt); if (CLI.RetZExt) Attrs.push_back(Attribute::ZExt); if (CLI.IsInReg) Attrs.push_back(Attribute::InReg); return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex, Attrs); } /// TargetLowering::LowerCallTo - This is the default LowerCallTo /// implementation, which just calls LowerCall. /// FIXME: When all targets are /// migrated to using LowerCall, this hook should be integrated into SDISel. std::pair TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // Handle the incoming return values from the call. CLI.Ins.clear(); Type *OrigRetTy = CLI.RetTy; SmallVector RetTys; SmallVector Offsets; ComputeValueVTs(*this, CLI.RetTy, RetTys, &Offsets); SmallVector Outs; GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, *this); bool CanLowerReturn = this->CanLowerReturn(CLI.CallConv, CLI.DAG.getMachineFunction(), CLI.IsVarArg, Outs, CLI.RetTy->getContext()); SDValue DemoteStackSlot; int DemoteStackIdx = -100; if (!CanLowerReturn) { // FIXME: equivalent assert? // assert(!CS.hasInAllocaArgument() && // "sret demotion is incompatible with inalloca"); uint64_t TySize = getDataLayout()->getTypeAllocSize(CLI.RetTy); unsigned Align = getDataLayout()->getPrefTypeAlignment(CLI.RetTy); MachineFunction &MF = CLI.DAG.getMachineFunction(); DemoteStackIdx = MF.getFrameInfo()->CreateStackObject(TySize, Align, false); Type *StackSlotPtrType = PointerType::getUnqual(CLI.RetTy); DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getPointerTy()); ArgListEntry Entry; Entry.Node = DemoteStackSlot; Entry.Ty = StackSlotPtrType; Entry.isSExt = false; Entry.isZExt = false; Entry.isInReg = false; Entry.isSRet = true; Entry.isNest = false; Entry.isByVal = false; Entry.isReturned = false; Entry.Alignment = Align; CLI.getArgs().insert(CLI.getArgs().begin(), Entry); CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext()); } else { for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { EVT VT = RetTys[I]; MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT); unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT); for (unsigned i = 0; i != NumRegs; ++i) { ISD::InputArg MyFlags; MyFlags.VT = RegisterVT; MyFlags.ArgVT = VT; MyFlags.Used = CLI.IsReturnValueUsed; if (CLI.RetSExt) MyFlags.Flags.setSExt(); if (CLI.RetZExt) MyFlags.Flags.setZExt(); if (CLI.IsInReg) MyFlags.Flags.setInReg(); CLI.Ins.push_back(MyFlags); } } } // Handle all of the outgoing arguments. CLI.Outs.clear(); CLI.OutVals.clear(); ArgListTy &Args = CLI.getArgs(); for (unsigned i = 0, e = Args.size(); i != e; ++i) { SmallVector ValueVTs; ComputeValueVTs(*this, Args[i].Ty, ValueVTs); Type *FinalType = Args[i].Ty; if (Args[i].isByVal) FinalType = cast(Args[i].Ty)->getElementType(); bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( FinalType, CLI.CallConv, CLI.IsVarArg); for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++Value) { EVT VT = ValueVTs[Value]; Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext()); SDValue Op = SDValue(Args[i].Node.getNode(), Args[i].Node.getResNo() + Value); ISD::ArgFlagsTy Flags; unsigned OriginalAlignment = getDataLayout()->getABITypeAlignment(ArgTy); if (Args[i].isZExt) Flags.setZExt(); if (Args[i].isSExt) Flags.setSExt(); if (Args[i].isInReg) Flags.setInReg(); if (Args[i].isSRet) Flags.setSRet(); if (Args[i].isByVal) Flags.setByVal(); if (Args[i].isInAlloca) { Flags.setInAlloca(); // Set the byval flag for CCAssignFn callbacks that don't know about // inalloca. This way we can know how many bytes we should've allocated // and how many bytes a callee cleanup function will pop. If we port // inalloca to more targets, we'll have to add custom inalloca handling // in the various CC lowering callbacks. Flags.setByVal(); } if (Args[i].isByVal || Args[i].isInAlloca) { PointerType *Ty = cast(Args[i].Ty); Type *ElementTy = Ty->getElementType(); Flags.setByValSize(getDataLayout()->getTypeAllocSize(ElementTy)); // For ByVal, alignment should come from FE. BE will guess if this // info is not there but there are cases it cannot get right. unsigned FrameAlign; if (Args[i].Alignment) FrameAlign = Args[i].Alignment; else FrameAlign = getByValTypeAlignment(ElementTy); Flags.setByValAlign(FrameAlign); } if (Args[i].isNest) Flags.setNest(); if (NeedsRegBlock) { Flags.setInConsecutiveRegs(); if (Value == NumValues - 1) Flags.setInConsecutiveRegsLast(); } Flags.setOrigAlign(OriginalAlignment); MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT); unsigned NumParts = getNumRegisters(CLI.RetTy->getContext(), VT); SmallVector Parts(NumParts); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; if (Args[i].isSExt) ExtendKind = ISD::SIGN_EXTEND; else if (Args[i].isZExt) ExtendKind = ISD::ZERO_EXTEND; // Conservatively only handle 'returned' on non-vectors for now if (Args[i].isReturned && !Op.getValueType().isVector()) { assert(CLI.RetTy == Args[i].Ty && RetTys.size() == NumValues && "unexpected use of 'returned'"); // Before passing 'returned' to the target lowering code, ensure that // either the register MVT and the actual EVT are the same size or that // the return value and argument are extended in the same way; in these // cases it's safe to pass the argument register value unchanged as the // return register value (although it's at the target's option whether // to do so) // TODO: allow code generation to take advantage of partially preserved // registers rather than clobbering the entire register when the // parameter extension method is not compatible with the return // extension method if ((NumParts * PartVT.getSizeInBits() == VT.getSizeInBits()) || (ExtendKind != ISD::ANY_EXTEND && CLI.RetSExt == Args[i].isSExt && CLI.RetZExt == Args[i].isZExt)) Flags.setReturned(); } getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT, CLI.CS ? CLI.CS->getInstruction() : nullptr, ExtendKind); for (unsigned j = 0; j != NumParts; ++j) { // if it isn't first piece, alignment must be 1 ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(), VT, i < CLI.NumFixedArgs, i, j*Parts[j].getValueType().getStoreSize()); if (NumParts > 1 && j == 0) MyFlags.Flags.setSplit(); else if (j != 0) MyFlags.Flags.setOrigAlign(1); CLI.Outs.push_back(MyFlags); CLI.OutVals.push_back(Parts[j]); } } } SmallVector InVals; CLI.Chain = LowerCall(CLI, InVals); // Verify that the target's LowerCall behaved as expected. assert(CLI.Chain.getNode() && CLI.Chain.getValueType() == MVT::Other && "LowerCall didn't return a valid chain!"); assert((!CLI.IsTailCall || InVals.empty()) && "LowerCall emitted a return value for a tail call!"); assert((CLI.IsTailCall || InVals.size() == CLI.Ins.size()) && "LowerCall didn't emit the correct number of values!"); // For a tail call, the return value is merely live-out and there aren't // any nodes in the DAG representing it. Return a special value to // indicate that a tail call has been emitted and no more Instructions // should be processed in the current block. if (CLI.IsTailCall) { CLI.DAG.setRoot(CLI.Chain); return std::make_pair(SDValue(), SDValue()); } DEBUG(for (unsigned i = 0, e = CLI.Ins.size(); i != e; ++i) { assert(InVals[i].getNode() && "LowerCall emitted a null value!"); assert(EVT(CLI.Ins[i].VT) == InVals[i].getValueType() && "LowerCall emitted a value with the wrong type!"); }); SmallVector ReturnValues; if (!CanLowerReturn) { // The instruction result is the result of loading from the // hidden sret parameter. SmallVector PVTs; Type *PtrRetTy = PointerType::getUnqual(OrigRetTy); ComputeValueVTs(*this, PtrRetTy, PVTs); assert(PVTs.size() == 1 && "Pointers should fit in one register"); EVT PtrVT = PVTs[0]; unsigned NumValues = RetTys.size(); ReturnValues.resize(NumValues); SmallVector Chains(NumValues); for (unsigned i = 0; i < NumValues; ++i) { SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot, CLI.DAG.getConstant(Offsets[i], PtrVT)); SDValue L = CLI.DAG.getLoad( RetTys[i], CLI.DL, CLI.Chain, Add, MachinePointerInfo::getFixedStack(DemoteStackIdx, Offsets[i]), false, false, false, 1); ReturnValues[i] = L; Chains[i] = L.getValue(1); } CLI.Chain = CLI.DAG.getNode(ISD::TokenFactor, CLI.DL, MVT::Other, Chains); } else { // Collect the legal value parts into potentially illegal values // that correspond to the original function's return values. ISD::NodeType AssertOp = ISD::DELETED_NODE; if (CLI.RetSExt) AssertOp = ISD::AssertSext; else if (CLI.RetZExt) AssertOp = ISD::AssertZext; unsigned CurReg = 0; for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { EVT VT = RetTys[I]; MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT); unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT); ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg], NumRegs, RegisterVT, VT, nullptr, AssertOp)); CurReg += NumRegs; } // For a function returning void, there is no return value. We can't create // such a node, so we just return a null return value in that case. In // that case, nothing will actually look at the value. if (ReturnValues.empty()) return std::make_pair(SDValue(), CLI.Chain); } SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL, CLI.DAG.getVTList(RetTys), ReturnValues); return std::make_pair(Res, CLI.Chain); } void TargetLowering::LowerOperationWrapper(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { SDValue Res = LowerOperation(SDValue(N, 0), DAG); if (Res.getNode()) Results.push_back(Res); } SDValue TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("LowerOperation not implemented for this target!"); } void SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) { SDValue Op = getNonRegisterValue(V); assert((Op.getOpcode() != ISD::CopyFromReg || cast(Op.getOperand(1))->getReg() != Reg) && "Copy from a reg to the same reg!"); assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); RegsForValue RFV(V->getContext(), TLI, Reg, V->getType()); SDValue Chain = DAG.getEntryNode(); ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) == FuncInfo.PreferredExtendType.end()) ? ISD::ANY_EXTEND : FuncInfo.PreferredExtendType[V]; RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, nullptr, V, ExtendType); PendingExports.push_back(Chain); } #include "llvm/CodeGen/SelectionDAGISel.h" /// isOnlyUsedInEntryBlock - If the specified argument is only used in the /// entry block, return true. This includes arguments used by switches, since /// the switch may expand into multiple basic blocks. static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) { // With FastISel active, we may be splitting blocks, so force creation // of virtual registers for all non-dead arguments. if (FastISel) return A->use_empty(); const BasicBlock *Entry = A->getParent()->begin(); for (const User *U : A->users()) if (cast(U)->getParent() != Entry || isa(U)) return false; // Use not in entry block. return true; } void SelectionDAGISel::LowerArguments(const Function &F) { SelectionDAG &DAG = SDB->DAG; SDLoc dl = SDB->getCurSDLoc(); const DataLayout *DL = TLI->getDataLayout(); SmallVector Ins; if (!FuncInfo->CanLowerReturn) { // Put in an sret pointer parameter before all the other parameters. SmallVector ValueVTs; ComputeValueVTs(*TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs); // NOTE: Assuming that a pointer will never break down to more than one VT // or one register. ISD::ArgFlagsTy Flags; Flags.setSRet(); MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVTs[0]); ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true, 0, 0); Ins.push_back(RetArg); } // Set up the incoming argument description vector. unsigned Idx = 1; for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I, ++Idx) { SmallVector ValueVTs; ComputeValueVTs(*TLI, I->getType(), ValueVTs); bool isArgValueUsed = !I->use_empty(); unsigned PartBase = 0; Type *FinalType = I->getType(); if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal)) FinalType = cast(FinalType)->getElementType(); bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters( FinalType, F.getCallingConv(), F.isVarArg()); for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++Value) { EVT VT = ValueVTs[Value]; Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); ISD::ArgFlagsTy Flags; unsigned OriginalAlignment = DL->getABITypeAlignment(ArgTy); if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt)) Flags.setZExt(); if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) Flags.setSExt(); if (F.getAttributes().hasAttribute(Idx, Attribute::InReg)) Flags.setInReg(); if (F.getAttributes().hasAttribute(Idx, Attribute::StructRet)) Flags.setSRet(); if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal)) Flags.setByVal(); if (F.getAttributes().hasAttribute(Idx, Attribute::InAlloca)) { Flags.setInAlloca(); // Set the byval flag for CCAssignFn callbacks that don't know about // inalloca. This way we can know how many bytes we should've allocated // and how many bytes a callee cleanup function will pop. If we port // inalloca to more targets, we'll have to add custom inalloca handling // in the various CC lowering callbacks. Flags.setByVal(); } if (Flags.isByVal() || Flags.isInAlloca()) { PointerType *Ty = cast(I->getType()); Type *ElementTy = Ty->getElementType(); Flags.setByValSize(DL->getTypeAllocSize(ElementTy)); // For ByVal, alignment should be passed from FE. BE will guess if // this info is not there but there are cases it cannot get right. unsigned FrameAlign; if (F.getParamAlignment(Idx)) FrameAlign = F.getParamAlignment(Idx); else FrameAlign = TLI->getByValTypeAlignment(ElementTy); Flags.setByValAlign(FrameAlign); } if (F.getAttributes().hasAttribute(Idx, Attribute::Nest)) Flags.setNest(); if (NeedsRegBlock) { Flags.setInConsecutiveRegs(); if (Value == NumValues - 1) Flags.setInConsecutiveRegsLast(); } Flags.setOrigAlign(OriginalAlignment); MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT); unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT); for (unsigned i = 0; i != NumRegs; ++i) { ISD::InputArg MyFlags(Flags, RegisterVT, VT, isArgValueUsed, Idx-1, PartBase+i*RegisterVT.getStoreSize()); if (NumRegs > 1 && i == 0) MyFlags.Flags.setSplit(); // if it isn't first piece, alignment must be 1 else if (i > 0) MyFlags.Flags.setOrigAlign(1); Ins.push_back(MyFlags); } PartBase += VT.getStoreSize(); } } // Call the target to set up the argument values. SmallVector InVals; SDValue NewRoot = TLI->LowerFormalArguments( DAG.getRoot(), F.getCallingConv(), F.isVarArg(), Ins, dl, DAG, InVals); // Verify that the target's LowerFormalArguments behaved as expected. assert(NewRoot.getNode() && NewRoot.getValueType() == MVT::Other && "LowerFormalArguments didn't return a valid chain!"); assert(InVals.size() == Ins.size() && "LowerFormalArguments didn't emit the correct number of values!"); DEBUG({ for (unsigned i = 0, e = Ins.size(); i != e; ++i) { assert(InVals[i].getNode() && "LowerFormalArguments emitted a null value!"); assert(EVT(Ins[i].VT) == InVals[i].getValueType() && "LowerFormalArguments emitted a value with the wrong type!"); } }); // Update the DAG with the new chain value resulting from argument lowering. DAG.setRoot(NewRoot); // Set up the argument values. unsigned i = 0; Idx = 1; if (!FuncInfo->CanLowerReturn) { // Create a virtual register for the sret pointer, and put in a copy // from the sret argument into it. SmallVector ValueVTs; ComputeValueVTs(*TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs); MVT VT = ValueVTs[0].getSimpleVT(); MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT); ISD::NodeType AssertOp = ISD::DELETED_NODE; SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1, RegVT, VT, nullptr, AssertOp); MachineFunction& MF = SDB->DAG.getMachineFunction(); MachineRegisterInfo& RegInfo = MF.getRegInfo(); unsigned SRetReg = RegInfo.createVirtualRegister(TLI->getRegClassFor(RegVT)); FuncInfo->DemoteRegister = SRetReg; NewRoot = SDB->DAG.getCopyToReg(NewRoot, SDB->getCurSDLoc(), SRetReg, ArgValue); DAG.setRoot(NewRoot); // i indexes lowered arguments. Bump it past the hidden sret argument. // Idx indexes LLVM arguments. Don't touch it. ++i; } for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I, ++Idx) { SmallVector ArgValues; SmallVector ValueVTs; ComputeValueVTs(*TLI, I->getType(), ValueVTs); unsigned NumValues = ValueVTs.size(); // If this argument is unused then remember its value. It is used to generate // debugging information. if (I->use_empty() && NumValues) { SDB->setUnusedArgValue(I, InVals[i]); // Also remember any frame index for use in FastISel. if (FrameIndexSDNode *FI = dyn_cast(InVals[i].getNode())) FuncInfo->setArgumentFrameIndex(I, FI->getIndex()); } for (unsigned Val = 0; Val != NumValues; ++Val) { EVT VT = ValueVTs[Val]; MVT PartVT = TLI->getRegisterType(*CurDAG->getContext(), VT); unsigned NumParts = TLI->getNumRegisters(*CurDAG->getContext(), VT); if (!I->use_empty()) { ISD::NodeType AssertOp = ISD::DELETED_NODE; if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) AssertOp = ISD::AssertSext; else if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt)) AssertOp = ISD::AssertZext; ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts, PartVT, VT, nullptr, AssertOp)); } i += NumParts; } // We don't need to do anything else for unused arguments. if (ArgValues.empty()) continue; // Note down frame index. if (FrameIndexSDNode *FI = dyn_cast(ArgValues[0].getNode())) FuncInfo->setArgumentFrameIndex(I, FI->getIndex()); SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues), SDB->getCurSDLoc()); SDB->setValue(I, Res); if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) { if (LoadSDNode *LNode = dyn_cast(Res.getOperand(0).getNode())) if (FrameIndexSDNode *FI = dyn_cast(LNode->getBasePtr().getNode())) FuncInfo->setArgumentFrameIndex(I, FI->getIndex()); } // If this argument is live outside of the entry block, insert a copy from // wherever we got it to the vreg that other BB's will reference it as. if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::CopyFromReg) { // If we can, though, try to skip creating an unnecessary vreg. // FIXME: This isn't very clean... it would be nice to make this more // general. It's also subtly incompatible with the hacks FastISel // uses with vregs. unsigned Reg = cast(Res.getOperand(1))->getReg(); if (TargetRegisterInfo::isVirtualRegister(Reg)) { FuncInfo->ValueMap[I] = Reg; continue; } } if (!isOnlyUsedInEntryBlock(I, TM.Options.EnableFastISel)) { FuncInfo->InitializeRegForValue(I); SDB->CopyToExportRegsIfNeeded(I); } } assert(i == InVals.size() && "Argument register count mismatch!"); // Finally, if the target has anything special to do, allow it to do so. // FIXME: this should insert code into the DAG! EmitFunctionEntryCode(); } /// Handle PHI nodes in successor blocks. Emit code into the SelectionDAG to /// ensure constants are generated when needed. Remember the virtual registers /// that need to be added to the Machine PHI nodes as input. We cannot just /// directly add them, because expansion might result in multiple MBB's for one /// BB. As such, the start of the BB might correspond to a different MBB than /// the end. /// void SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { const TerminatorInst *TI = LLVMBB->getTerminator(); SmallPtrSet SuccsHandled; // Check successor nodes' PHI nodes that expect a constant to be available // from this block. for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) { const BasicBlock *SuccBB = TI->getSuccessor(succ); if (!isa(SuccBB->begin())) continue; MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB]; // If this terminator has multiple identical successors (common for // switches), only handle each succ once. if (!SuccsHandled.insert(SuccMBB).second) continue; MachineBasicBlock::iterator MBBI = SuccMBB->begin(); // At this point we know that there is a 1-1 correspondence between LLVM PHI // nodes and Machine PHI nodes, but the incoming operands have not been // emitted yet. for (BasicBlock::const_iterator I = SuccBB->begin(); const PHINode *PN = dyn_cast(I); ++I) { // Ignore dead phi's. if (PN->use_empty()) continue; // Skip empty types if (PN->getType()->isEmptyTy()) continue; unsigned Reg; const Value *PHIOp = PN->getIncomingValueForBlock(LLVMBB); if (const Constant *C = dyn_cast(PHIOp)) { unsigned &RegOut = ConstantsOut[C]; if (RegOut == 0) { RegOut = FuncInfo.CreateRegs(C->getType()); CopyValueToVirtualRegister(C, RegOut); } Reg = RegOut; } else { DenseMap::iterator I = FuncInfo.ValueMap.find(PHIOp); if (I != FuncInfo.ValueMap.end()) Reg = I->second; else { assert(isa(PHIOp) && FuncInfo.StaticAllocaMap.count(cast(PHIOp)) && "Didn't codegen value into a register!??"); Reg = FuncInfo.CreateRegs(PHIOp->getType()); CopyValueToVirtualRegister(PHIOp, Reg); } } // Remember that this register needs to added to the machine PHI node as // the input for this MBB. SmallVector ValueVTs; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); ComputeValueVTs(TLI, PN->getType(), ValueVTs); for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) { EVT VT = ValueVTs[vti]; unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT); for (unsigned i = 0, e = NumRegisters; i != e; ++i) FuncInfo.PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg+i)); Reg += NumRegisters; } } } ConstantsOut.clear(); } /// Add a successor MBB to ParentMBB< creating a new MachineBB for BB if SuccMBB /// is 0. MachineBasicBlock * SelectionDAGBuilder::StackProtectorDescriptor:: AddSuccessorMBB(const BasicBlock *BB, MachineBasicBlock *ParentMBB, bool IsLikely, MachineBasicBlock *SuccMBB) { // If SuccBB has not been created yet, create it. if (!SuccMBB) { MachineFunction *MF = ParentMBB->getParent(); MachineFunction::iterator BBI = ParentMBB; SuccMBB = MF->CreateMachineBasicBlock(BB); MF->insert(++BBI, SuccMBB); } // Add it as a successor of ParentMBB. ParentMBB->addSuccessor( SuccMBB, BranchProbabilityInfo::getBranchWeightStackProtector(IsLikely)); return SuccMBB; } Index: projects/clang360-import/contrib/llvm/lib/ExecutionEngine/RTDyldMemoryManager.cpp =================================================================== --- projects/clang360-import/contrib/llvm/lib/ExecutionEngine/RTDyldMemoryManager.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/lib/ExecutionEngine/RTDyldMemoryManager.cpp (nonexistent) @@ -1,294 +0,0 @@ -//===-- RTDyldMemoryManager.cpp - Memory manager for MC-JIT -----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Implementation of the runtime dynamic memory manager base class. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Config/config.h" -#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/DynamicLibrary.h" -#include "llvm/Support/ErrorHandling.h" -#include - -#ifdef __linux__ - // These includes used by RTDyldMemoryManager::getPointerToNamedFunction() - // for Glibc trickery. See comments in this function for more information. - #ifdef HAVE_SYS_STAT_H - #include - #endif - #include - #include -#endif - -namespace llvm { - -RTDyldMemoryManager::~RTDyldMemoryManager() {} - -// Determine whether we can register EH tables. -#if (defined(__GNUC__) && !defined(__ARM_EABI__) && !defined(__ia64__) && \ - !defined(__SEH__) && !defined(__USING_SJLJ_EXCEPTIONS__)) -#define HAVE_EHTABLE_SUPPORT 1 -#else -#define HAVE_EHTABLE_SUPPORT 0 -#endif - -#if HAVE_EHTABLE_SUPPORT -extern "C" void __register_frame(void*); -extern "C" void __deregister_frame(void*); -#else -// The building compiler does not have __(de)register_frame but -// it may be found at runtime in a dynamically-loaded library. -// For example, this happens when building LLVM with Visual C++ -// but using the MingW runtime. -void __register_frame(void *p) { - static bool Searched = false; - static void *rf = 0; - - if (!Searched) { - Searched = true; - rf = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol( - "__register_frame"); - } - if (rf) - ((void (*)(void *))rf)(p); -} - -void __deregister_frame(void *p) { - static bool Searched = false; - static void *df = 0; - - if (!Searched) { - Searched = true; - df = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol( - "__deregister_frame"); - } - if (df) - ((void (*)(void *))df)(p); -} -#endif - -#ifdef __APPLE__ - -static const char *processFDE(const char *Entry, bool isDeregister) { - const char *P = Entry; - uint32_t Length = *((const uint32_t *)P); - P += 4; - uint32_t Offset = *((const uint32_t *)P); - if (Offset != 0) { - if (isDeregister) - __deregister_frame(const_cast(Entry)); - else - __register_frame(const_cast(Entry)); - } - return P + Length; -} - -// This implementation handles frame registration for local targets. -// Memory managers for remote targets should re-implement this function -// and use the LoadAddr parameter. -void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr, - uint64_t LoadAddr, - size_t Size) { - // On OS X OS X __register_frame takes a single FDE as an argument. - // See http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-April/061768.html - const char *P = (const char *)Addr; - const char *End = P + Size; - do { - P = processFDE(P, false); - } while(P != End); -} - -void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr, - uint64_t LoadAddr, - size_t Size) { - const char *P = (const char *)Addr; - const char *End = P + Size; - do { - P = processFDE(P, true); - } while(P != End); -} - -#else - -void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr, - uint64_t LoadAddr, - size_t Size) { - // On Linux __register_frame takes a single argument: - // a pointer to the start of the .eh_frame section. - - // How can it find the end? Because crtendS.o is linked - // in and it has an .eh_frame section with four zero chars. - __register_frame(Addr); -} - -void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr, - uint64_t LoadAddr, - size_t Size) { - __deregister_frame(Addr); -} - -#endif - -static int jit_noop() { - return 0; -} - -// ARM math functions are statically linked on Android from libgcc.a, but not -// available at runtime for dynamic linking. On Linux these are usually placed -// in libgcc_s.so so can be found by normal dynamic lookup. -#if defined(__BIONIC__) && defined(__arm__) -// List of functions which are statically linked on Android and can be generated -// by LLVM. This is done as a nested macro which is used once to declare the -// imported functions with ARM_MATH_DECL and once to compare them to the -// user-requested symbol in getSymbolAddress with ARM_MATH_CHECK. The test -// assumes that all functions start with __aeabi_ and getSymbolAddress must be -// modified if that changes. -#define ARM_MATH_IMPORTS(PP) \ - PP(__aeabi_d2f) \ - PP(__aeabi_d2iz) \ - PP(__aeabi_d2lz) \ - PP(__aeabi_d2uiz) \ - PP(__aeabi_d2ulz) \ - PP(__aeabi_dadd) \ - PP(__aeabi_dcmpeq) \ - PP(__aeabi_dcmpge) \ - PP(__aeabi_dcmpgt) \ - PP(__aeabi_dcmple) \ - PP(__aeabi_dcmplt) \ - PP(__aeabi_dcmpun) \ - PP(__aeabi_ddiv) \ - PP(__aeabi_dmul) \ - PP(__aeabi_dsub) \ - PP(__aeabi_f2d) \ - PP(__aeabi_f2iz) \ - PP(__aeabi_f2lz) \ - PP(__aeabi_f2uiz) \ - PP(__aeabi_f2ulz) \ - PP(__aeabi_fadd) \ - PP(__aeabi_fcmpeq) \ - PP(__aeabi_fcmpge) \ - PP(__aeabi_fcmpgt) \ - PP(__aeabi_fcmple) \ - PP(__aeabi_fcmplt) \ - PP(__aeabi_fcmpun) \ - PP(__aeabi_fdiv) \ - PP(__aeabi_fmul) \ - PP(__aeabi_fsub) \ - PP(__aeabi_i2d) \ - PP(__aeabi_i2f) \ - PP(__aeabi_idiv) \ - PP(__aeabi_idivmod) \ - PP(__aeabi_l2d) \ - PP(__aeabi_l2f) \ - PP(__aeabi_lasr) \ - PP(__aeabi_ldivmod) \ - PP(__aeabi_llsl) \ - PP(__aeabi_llsr) \ - PP(__aeabi_lmul) \ - PP(__aeabi_ui2d) \ - PP(__aeabi_ui2f) \ - PP(__aeabi_uidiv) \ - PP(__aeabi_uidivmod) \ - PP(__aeabi_ul2d) \ - PP(__aeabi_ul2f) \ - PP(__aeabi_uldivmod) - -// Declare statically linked math functions on ARM. The function declarations -// here do not have the correct prototypes for each function in -// ARM_MATH_IMPORTS, but it doesn't matter because only the symbol addresses are -// needed. In particular the __aeabi_*divmod functions do not have calling -// conventions which match any C prototype. -#define ARM_MATH_DECL(name) extern "C" void name(); -ARM_MATH_IMPORTS(ARM_MATH_DECL) -#undef ARM_MATH_DECL -#endif - -#if defined(__linux__) && defined(__GLIBC__) && \ - (defined(__i386__) || defined(__x86_64__)) -extern "C" LLVM_ATTRIBUTE_WEAK void __morestack(); -#endif - -uint64_t -RTDyldMemoryManager::getSymbolAddressInProcess(const std::string &Name) { - // This implementation assumes that the host program is the target. - // Clients generating code for a remote target should implement their own - // memory manager. -#if defined(__linux__) && defined(__GLIBC__) - //===--------------------------------------------------------------------===// - // Function stubs that are invoked instead of certain library calls - // - // Force the following functions to be linked in to anything that uses the - // JIT. This is a hack designed to work around the all-too-clever Glibc - // strategy of making these functions work differently when inlined vs. when - // not inlined, and hiding their real definitions in a separate archive file - // that the dynamic linker can't see. For more info, search for - // 'libc_nonshared.a' on Google, or read http://llvm.org/PR274. - if (Name == "stat") return (uint64_t)&stat; - if (Name == "fstat") return (uint64_t)&fstat; - if (Name == "lstat") return (uint64_t)&lstat; - if (Name == "stat64") return (uint64_t)&stat64; - if (Name == "fstat64") return (uint64_t)&fstat64; - if (Name == "lstat64") return (uint64_t)&lstat64; - if (Name == "atexit") return (uint64_t)&atexit; - if (Name == "mknod") return (uint64_t)&mknod; - -#if defined(__i386__) || defined(__x86_64__) - // __morestack lives in libgcc, a static library. - if (&__morestack && Name == "__morestack") - return (uint64_t)&__morestack; -#endif -#endif // __linux__ && __GLIBC__ - - // See ARM_MATH_IMPORTS definition for explanation -#if defined(__BIONIC__) && defined(__arm__) - if (Name.compare(0, 8, "__aeabi_") == 0) { - // Check if the user has requested any of the functions listed in - // ARM_MATH_IMPORTS, and if so redirect to the statically linked symbol. -#define ARM_MATH_CHECK(fn) if (Name == #fn) return (uint64_t)&fn; - ARM_MATH_IMPORTS(ARM_MATH_CHECK) -#undef ARM_MATH_CHECK - } -#endif - - // We should not invoke parent's ctors/dtors from generated main()! - // On Mingw and Cygwin, the symbol __main is resolved to - // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors - // (and register wrong callee's dtors with atexit(3)). - // We expect ExecutionEngine::runStaticConstructorsDestructors() - // is called before ExecutionEngine::runFunctionAsMain() is called. - if (Name == "__main") return (uint64_t)&jit_noop; - - // Try to demangle Name before looking it up in the process, otherwise symbol - // '_' (if present) will shadow '', and there will be no way to - // refer to the latter. - - const char *NameStr = Name.c_str(); - - if (NameStr[0] == '_') - if (void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr + 1)) - return (uint64_t)Ptr; - - // If we Name did not require demangling, or we failed to find the demangled - // name, try again without demangling. - return (uint64_t)sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr); -} - -void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name, - bool AbortOnFailure) { - uint64_t Addr = getSymbolAddress(Name); - - if (!Addr && AbortOnFailure) - report_fatal_error("Program used external function '" + Name + - "' which could not be resolved!"); - return (void*)Addr; -} - -} // namespace llvm Index: projects/clang360-import/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp =================================================================== --- projects/clang360-import/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp (nonexistent) +++ projects/clang360-import/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp (revision 279025) @@ -0,0 +1,294 @@ +//===-- RTDyldMemoryManager.cpp - Memory manager for MC-JIT -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the runtime dynamic memory manager base class. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Config/config.h" +#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Support/ErrorHandling.h" +#include + +#ifdef __linux__ + // These includes used by RTDyldMemoryManager::getPointerToNamedFunction() + // for Glibc trickery. See comments in this function for more information. + #ifdef HAVE_SYS_STAT_H + #include + #endif + #include + #include +#endif + +namespace llvm { + +RTDyldMemoryManager::~RTDyldMemoryManager() {} + +// Determine whether we can register EH tables. +#if (defined(__GNUC__) && !defined(__ARM_EABI__) && !defined(__ia64__) && \ + !defined(__SEH__) && !defined(__USING_SJLJ_EXCEPTIONS__)) +#define HAVE_EHTABLE_SUPPORT 1 +#else +#define HAVE_EHTABLE_SUPPORT 0 +#endif + +#if HAVE_EHTABLE_SUPPORT +extern "C" void __register_frame(void*); +extern "C" void __deregister_frame(void*); +#else +// The building compiler does not have __(de)register_frame but +// it may be found at runtime in a dynamically-loaded library. +// For example, this happens when building LLVM with Visual C++ +// but using the MingW runtime. +void __register_frame(void *p) { + static bool Searched = false; + static void *rf = 0; + + if (!Searched) { + Searched = true; + rf = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol( + "__register_frame"); + } + if (rf) + ((void (*)(void *))rf)(p); +} + +void __deregister_frame(void *p) { + static bool Searched = false; + static void *df = 0; + + if (!Searched) { + Searched = true; + df = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol( + "__deregister_frame"); + } + if (df) + ((void (*)(void *))df)(p); +} +#endif + +#ifdef __APPLE__ + +static const char *processFDE(const char *Entry, bool isDeregister) { + const char *P = Entry; + uint32_t Length = *((const uint32_t *)P); + P += 4; + uint32_t Offset = *((const uint32_t *)P); + if (Offset != 0) { + if (isDeregister) + __deregister_frame(const_cast(Entry)); + else + __register_frame(const_cast(Entry)); + } + return P + Length; +} + +// This implementation handles frame registration for local targets. +// Memory managers for remote targets should re-implement this function +// and use the LoadAddr parameter. +void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr, + uint64_t LoadAddr, + size_t Size) { + // On OS X OS X __register_frame takes a single FDE as an argument. + // See http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-April/061768.html + const char *P = (const char *)Addr; + const char *End = P + Size; + do { + P = processFDE(P, false); + } while(P != End); +} + +void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr, + uint64_t LoadAddr, + size_t Size) { + const char *P = (const char *)Addr; + const char *End = P + Size; + do { + P = processFDE(P, true); + } while(P != End); +} + +#else + +void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr, + uint64_t LoadAddr, + size_t Size) { + // On Linux __register_frame takes a single argument: + // a pointer to the start of the .eh_frame section. + + // How can it find the end? Because crtendS.o is linked + // in and it has an .eh_frame section with four zero chars. + __register_frame(Addr); +} + +void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr, + uint64_t LoadAddr, + size_t Size) { + __deregister_frame(Addr); +} + +#endif + +static int jit_noop() { + return 0; +} + +// ARM math functions are statically linked on Android from libgcc.a, but not +// available at runtime for dynamic linking. On Linux these are usually placed +// in libgcc_s.so so can be found by normal dynamic lookup. +#if defined(__BIONIC__) && defined(__arm__) +// List of functions which are statically linked on Android and can be generated +// by LLVM. This is done as a nested macro which is used once to declare the +// imported functions with ARM_MATH_DECL and once to compare them to the +// user-requested symbol in getSymbolAddress with ARM_MATH_CHECK. The test +// assumes that all functions start with __aeabi_ and getSymbolAddress must be +// modified if that changes. +#define ARM_MATH_IMPORTS(PP) \ + PP(__aeabi_d2f) \ + PP(__aeabi_d2iz) \ + PP(__aeabi_d2lz) \ + PP(__aeabi_d2uiz) \ + PP(__aeabi_d2ulz) \ + PP(__aeabi_dadd) \ + PP(__aeabi_dcmpeq) \ + PP(__aeabi_dcmpge) \ + PP(__aeabi_dcmpgt) \ + PP(__aeabi_dcmple) \ + PP(__aeabi_dcmplt) \ + PP(__aeabi_dcmpun) \ + PP(__aeabi_ddiv) \ + PP(__aeabi_dmul) \ + PP(__aeabi_dsub) \ + PP(__aeabi_f2d) \ + PP(__aeabi_f2iz) \ + PP(__aeabi_f2lz) \ + PP(__aeabi_f2uiz) \ + PP(__aeabi_f2ulz) \ + PP(__aeabi_fadd) \ + PP(__aeabi_fcmpeq) \ + PP(__aeabi_fcmpge) \ + PP(__aeabi_fcmpgt) \ + PP(__aeabi_fcmple) \ + PP(__aeabi_fcmplt) \ + PP(__aeabi_fcmpun) \ + PP(__aeabi_fdiv) \ + PP(__aeabi_fmul) \ + PP(__aeabi_fsub) \ + PP(__aeabi_i2d) \ + PP(__aeabi_i2f) \ + PP(__aeabi_idiv) \ + PP(__aeabi_idivmod) \ + PP(__aeabi_l2d) \ + PP(__aeabi_l2f) \ + PP(__aeabi_lasr) \ + PP(__aeabi_ldivmod) \ + PP(__aeabi_llsl) \ + PP(__aeabi_llsr) \ + PP(__aeabi_lmul) \ + PP(__aeabi_ui2d) \ + PP(__aeabi_ui2f) \ + PP(__aeabi_uidiv) \ + PP(__aeabi_uidivmod) \ + PP(__aeabi_ul2d) \ + PP(__aeabi_ul2f) \ + PP(__aeabi_uldivmod) + +// Declare statically linked math functions on ARM. The function declarations +// here do not have the correct prototypes for each function in +// ARM_MATH_IMPORTS, but it doesn't matter because only the symbol addresses are +// needed. In particular the __aeabi_*divmod functions do not have calling +// conventions which match any C prototype. +#define ARM_MATH_DECL(name) extern "C" void name(); +ARM_MATH_IMPORTS(ARM_MATH_DECL) +#undef ARM_MATH_DECL +#endif + +#if defined(__linux__) && defined(__GLIBC__) && \ + (defined(__i386__) || defined(__x86_64__)) +extern "C" LLVM_ATTRIBUTE_WEAK void __morestack(); +#endif + +uint64_t +RTDyldMemoryManager::getSymbolAddressInProcess(const std::string &Name) { + // This implementation assumes that the host program is the target. + // Clients generating code for a remote target should implement their own + // memory manager. +#if defined(__linux__) && defined(__GLIBC__) + //===--------------------------------------------------------------------===// + // Function stubs that are invoked instead of certain library calls + // + // Force the following functions to be linked in to anything that uses the + // JIT. This is a hack designed to work around the all-too-clever Glibc + // strategy of making these functions work differently when inlined vs. when + // not inlined, and hiding their real definitions in a separate archive file + // that the dynamic linker can't see. For more info, search for + // 'libc_nonshared.a' on Google, or read http://llvm.org/PR274. + if (Name == "stat") return (uint64_t)&stat; + if (Name == "fstat") return (uint64_t)&fstat; + if (Name == "lstat") return (uint64_t)&lstat; + if (Name == "stat64") return (uint64_t)&stat64; + if (Name == "fstat64") return (uint64_t)&fstat64; + if (Name == "lstat64") return (uint64_t)&lstat64; + if (Name == "atexit") return (uint64_t)&atexit; + if (Name == "mknod") return (uint64_t)&mknod; + +#if defined(__i386__) || defined(__x86_64__) + // __morestack lives in libgcc, a static library. + if (&__morestack && Name == "__morestack") + return (uint64_t)&__morestack; +#endif +#endif // __linux__ && __GLIBC__ + + // See ARM_MATH_IMPORTS definition for explanation +#if defined(__BIONIC__) && defined(__arm__) + if (Name.compare(0, 8, "__aeabi_") == 0) { + // Check if the user has requested any of the functions listed in + // ARM_MATH_IMPORTS, and if so redirect to the statically linked symbol. +#define ARM_MATH_CHECK(fn) if (Name == #fn) return (uint64_t)&fn; + ARM_MATH_IMPORTS(ARM_MATH_CHECK) +#undef ARM_MATH_CHECK + } +#endif + + // We should not invoke parent's ctors/dtors from generated main()! + // On Mingw and Cygwin, the symbol __main is resolved to + // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors + // (and register wrong callee's dtors with atexit(3)). + // We expect ExecutionEngine::runStaticConstructorsDestructors() + // is called before ExecutionEngine::runFunctionAsMain() is called. + if (Name == "__main") return (uint64_t)&jit_noop; + + // Try to demangle Name before looking it up in the process, otherwise symbol + // '_' (if present) will shadow '', and there will be no way to + // refer to the latter. + + const char *NameStr = Name.c_str(); + + if (NameStr[0] == '_') + if (void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr + 1)) + return (uint64_t)Ptr; + + // If we Name did not require demangling, or we failed to find the demangled + // name, try again without demangling. + return (uint64_t)sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr); +} + +void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name, + bool AbortOnFailure) { + uint64_t Addr = getSymbolAddress(Name); + + if (!Addr && AbortOnFailure) + report_fatal_error("Program used external function '" + Name + + "' which could not be resolved!"); + return (void*)Addr; +} + +} // namespace llvm Property changes on: projects/clang360-import/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/clang360-import/contrib/llvm/lib/IR/Constants.cpp =================================================================== --- projects/clang360-import/contrib/llvm/lib/IR/Constants.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/lib/IR/Constants.cpp (revision 279025) @@ -1,2967 +1,2982 @@ //===-- Constants.cpp - Implement Constant nodes --------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the Constant* classes. // //===----------------------------------------------------------------------===// #include "llvm/IR/Constants.h" #include "ConstantFold.h" #include "LLVMContextImpl.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include #include using namespace llvm; //===----------------------------------------------------------------------===// // Constant Class //===----------------------------------------------------------------------===// void Constant::anchor() { } bool Constant::isNegativeZeroValue() const { // Floating point values have an explicit -0.0 value. if (const ConstantFP *CFP = dyn_cast(this)) return CFP->isZero() && CFP->isNegative(); // Equivalent for a vector of -0.0's. if (const ConstantDataVector *CV = dyn_cast(this)) if (ConstantFP *SplatCFP = dyn_cast_or_null(CV->getSplatValue())) if (SplatCFP && SplatCFP->isZero() && SplatCFP->isNegative()) return true; // We've already handled true FP case; any other FP vectors can't represent -0.0. if (getType()->isFPOrFPVectorTy()) return false; // Otherwise, just use +0.0. return isNullValue(); } // Return true iff this constant is positive zero (floating point), negative // zero (floating point), or a null value. bool Constant::isZeroValue() const { // Floating point values have an explicit -0.0 value. if (const ConstantFP *CFP = dyn_cast(this)) return CFP->isZero(); // Otherwise, just use +0.0. return isNullValue(); } bool Constant::isNullValue() const { // 0 is null. if (const ConstantInt *CI = dyn_cast(this)) return CI->isZero(); // +0.0 is null. if (const ConstantFP *CFP = dyn_cast(this)) return CFP->isZero() && !CFP->isNegative(); // constant zero is zero for aggregates and cpnull is null for pointers. return isa(this) || isa(this); } bool Constant::isAllOnesValue() const { // Check for -1 integers if (const ConstantInt *CI = dyn_cast(this)) return CI->isMinusOne(); // Check for FP which are bitcasted from -1 integers if (const ConstantFP *CFP = dyn_cast(this)) return CFP->getValueAPF().bitcastToAPInt().isAllOnesValue(); // Check for constant vectors which are splats of -1 values. if (const ConstantVector *CV = dyn_cast(this)) if (Constant *Splat = CV->getSplatValue()) return Splat->isAllOnesValue(); // Check for constant vectors which are splats of -1 values. if (const ConstantDataVector *CV = dyn_cast(this)) if (Constant *Splat = CV->getSplatValue()) return Splat->isAllOnesValue(); return false; } bool Constant::isOneValue() const { // Check for 1 integers if (const ConstantInt *CI = dyn_cast(this)) return CI->isOne(); // Check for FP which are bitcasted from 1 integers if (const ConstantFP *CFP = dyn_cast(this)) return CFP->getValueAPF().bitcastToAPInt() == 1; // Check for constant vectors which are splats of 1 values. if (const ConstantVector *CV = dyn_cast(this)) if (Constant *Splat = CV->getSplatValue()) return Splat->isOneValue(); // Check for constant vectors which are splats of 1 values. if (const ConstantDataVector *CV = dyn_cast(this)) if (Constant *Splat = CV->getSplatValue()) return Splat->isOneValue(); return false; } bool Constant::isMinSignedValue() const { // Check for INT_MIN integers if (const ConstantInt *CI = dyn_cast(this)) return CI->isMinValue(/*isSigned=*/true); // Check for FP which are bitcasted from INT_MIN integers if (const ConstantFP *CFP = dyn_cast(this)) return CFP->getValueAPF().bitcastToAPInt().isMinSignedValue(); // Check for constant vectors which are splats of INT_MIN values. if (const ConstantVector *CV = dyn_cast(this)) if (Constant *Splat = CV->getSplatValue()) return Splat->isMinSignedValue(); // Check for constant vectors which are splats of INT_MIN values. if (const ConstantDataVector *CV = dyn_cast(this)) if (Constant *Splat = CV->getSplatValue()) return Splat->isMinSignedValue(); return false; } bool Constant::isNotMinSignedValue() const { // Check for INT_MIN integers if (const ConstantInt *CI = dyn_cast(this)) return !CI->isMinValue(/*isSigned=*/true); // Check for FP which are bitcasted from INT_MIN integers if (const ConstantFP *CFP = dyn_cast(this)) return !CFP->getValueAPF().bitcastToAPInt().isMinSignedValue(); // Check for constant vectors which are splats of INT_MIN values. if (const ConstantVector *CV = dyn_cast(this)) if (Constant *Splat = CV->getSplatValue()) return Splat->isNotMinSignedValue(); // Check for constant vectors which are splats of INT_MIN values. if (const ConstantDataVector *CV = dyn_cast(this)) if (Constant *Splat = CV->getSplatValue()) return Splat->isNotMinSignedValue(); // It *may* contain INT_MIN, we can't tell. return false; } // Constructor to create a '0' constant of arbitrary type... Constant *Constant::getNullValue(Type *Ty) { switch (Ty->getTypeID()) { case Type::IntegerTyID: return ConstantInt::get(Ty, 0); case Type::HalfTyID: return ConstantFP::get(Ty->getContext(), APFloat::getZero(APFloat::IEEEhalf)); case Type::FloatTyID: return ConstantFP::get(Ty->getContext(), APFloat::getZero(APFloat::IEEEsingle)); case Type::DoubleTyID: return ConstantFP::get(Ty->getContext(), APFloat::getZero(APFloat::IEEEdouble)); case Type::X86_FP80TyID: return ConstantFP::get(Ty->getContext(), APFloat::getZero(APFloat::x87DoubleExtended)); case Type::FP128TyID: return ConstantFP::get(Ty->getContext(), APFloat::getZero(APFloat::IEEEquad)); case Type::PPC_FP128TyID: return ConstantFP::get(Ty->getContext(), APFloat(APFloat::PPCDoubleDouble, APInt::getNullValue(128))); case Type::PointerTyID: return ConstantPointerNull::get(cast(Ty)); case Type::StructTyID: case Type::ArrayTyID: case Type::VectorTyID: return ConstantAggregateZero::get(Ty); default: // Function, Label, or Opaque type? llvm_unreachable("Cannot create a null constant of that type!"); } } Constant *Constant::getIntegerValue(Type *Ty, const APInt &V) { Type *ScalarTy = Ty->getScalarType(); // Create the base integer constant. Constant *C = ConstantInt::get(Ty->getContext(), V); // Convert an integer to a pointer, if necessary. if (PointerType *PTy = dyn_cast(ScalarTy)) C = ConstantExpr::getIntToPtr(C, PTy); // Broadcast a scalar to a vector, if necessary. if (VectorType *VTy = dyn_cast(Ty)) C = ConstantVector::getSplat(VTy->getNumElements(), C); return C; } Constant *Constant::getAllOnesValue(Type *Ty) { if (IntegerType *ITy = dyn_cast(Ty)) return ConstantInt::get(Ty->getContext(), APInt::getAllOnesValue(ITy->getBitWidth())); if (Ty->isFloatingPointTy()) { APFloat FL = APFloat::getAllOnesValue(Ty->getPrimitiveSizeInBits(), !Ty->isPPC_FP128Ty()); return ConstantFP::get(Ty->getContext(), FL); } VectorType *VTy = cast(Ty); return ConstantVector::getSplat(VTy->getNumElements(), getAllOnesValue(VTy->getElementType())); } /// getAggregateElement - For aggregates (struct/array/vector) return the /// constant that corresponds to the specified element if possible, or null if /// not. This can return null if the element index is a ConstantExpr, or if /// 'this' is a constant expr. Constant *Constant::getAggregateElement(unsigned Elt) const { if (const ConstantStruct *CS = dyn_cast(this)) return Elt < CS->getNumOperands() ? CS->getOperand(Elt) : nullptr; if (const ConstantArray *CA = dyn_cast(this)) return Elt < CA->getNumOperands() ? CA->getOperand(Elt) : nullptr; if (const ConstantVector *CV = dyn_cast(this)) return Elt < CV->getNumOperands() ? CV->getOperand(Elt) : nullptr; - if (const ConstantAggregateZero *CAZ =dyn_cast(this)) - return CAZ->getElementValue(Elt); + if (const ConstantAggregateZero *CAZ = dyn_cast(this)) + return Elt < CAZ->getNumElements() ? CAZ->getElementValue(Elt) : nullptr; if (const UndefValue *UV = dyn_cast(this)) - return UV->getElementValue(Elt); + return Elt < UV->getNumElements() ? UV->getElementValue(Elt) : nullptr; if (const ConstantDataSequential *CDS =dyn_cast(this)) return Elt < CDS->getNumElements() ? CDS->getElementAsConstant(Elt) : nullptr; return nullptr; } Constant *Constant::getAggregateElement(Constant *Elt) const { assert(isa(Elt->getType()) && "Index must be an integer"); if (ConstantInt *CI = dyn_cast(Elt)) return getAggregateElement(CI->getZExtValue()); return nullptr; } void Constant::destroyConstantImpl() { // When a Constant is destroyed, there may be lingering // references to the constant by other constants in the constant pool. These // constants are implicitly dependent on the module that is being deleted, // but they don't know that. Because we only find out when the CPV is // deleted, we must now notify all of our users (that should only be // Constants) that they are, in fact, invalid now and should be deleted. // while (!use_empty()) { Value *V = user_back(); #ifndef NDEBUG // Only in -g mode... if (!isa(V)) { dbgs() << "While deleting: " << *this << "\n\nUse still stuck around after Def is destroyed: " << *V << "\n\n"; } #endif assert(isa(V) && "References remain to Constant being destroyed"); cast(V)->destroyConstant(); // The constant should remove itself from our use list... assert((use_empty() || user_back() != V) && "Constant not removed!"); } // Value has no outstanding references it is safe to delete it now... delete this; } static bool canTrapImpl(const Constant *C, SmallPtrSetImpl &NonTrappingOps) { assert(C->getType()->isFirstClassType() && "Cannot evaluate aggregate vals!"); // The only thing that could possibly trap are constant exprs. const ConstantExpr *CE = dyn_cast(C); if (!CE) return false; // ConstantExpr traps if any operands can trap. for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) { if (ConstantExpr *Op = dyn_cast(CE->getOperand(i))) { if (NonTrappingOps.insert(Op).second && canTrapImpl(Op, NonTrappingOps)) return true; } } // Otherwise, only specific operations can trap. switch (CE->getOpcode()) { default: return false; case Instruction::UDiv: case Instruction::SDiv: case Instruction::FDiv: case Instruction::URem: case Instruction::SRem: case Instruction::FRem: // Div and rem can trap if the RHS is not known to be non-zero. if (!isa(CE->getOperand(1)) ||CE->getOperand(1)->isNullValue()) return true; return false; } } /// canTrap - Return true if evaluation of this constant could trap. This is /// true for things like constant expressions that could divide by zero. bool Constant::canTrap() const { SmallPtrSet NonTrappingOps; return canTrapImpl(this, NonTrappingOps); } /// Check if C contains a GlobalValue for which Predicate is true. static bool ConstHasGlobalValuePredicate(const Constant *C, bool (*Predicate)(const GlobalValue *)) { SmallPtrSet Visited; SmallVector WorkList; WorkList.push_back(C); Visited.insert(C); while (!WorkList.empty()) { const Constant *WorkItem = WorkList.pop_back_val(); if (const auto *GV = dyn_cast(WorkItem)) if (Predicate(GV)) return true; for (const Value *Op : WorkItem->operands()) { const Constant *ConstOp = dyn_cast(Op); if (!ConstOp) continue; if (Visited.insert(ConstOp).second) WorkList.push_back(ConstOp); } } return false; } /// Return true if the value can vary between threads. bool Constant::isThreadDependent() const { auto DLLImportPredicate = [](const GlobalValue *GV) { return GV->isThreadLocal(); }; return ConstHasGlobalValuePredicate(this, DLLImportPredicate); } bool Constant::isDLLImportDependent() const { auto DLLImportPredicate = [](const GlobalValue *GV) { return GV->hasDLLImportStorageClass(); }; return ConstHasGlobalValuePredicate(this, DLLImportPredicate); } /// Return true if the constant has users other than constant exprs and other /// dangling things. bool Constant::isConstantUsed() const { for (const User *U : users()) { const Constant *UC = dyn_cast(U); if (!UC || isa(UC)) return true; if (UC->isConstantUsed()) return true; } return false; } /// getRelocationInfo - This method classifies the entry according to /// whether or not it may generate a relocation entry. This must be /// conservative, so if it might codegen to a relocatable entry, it should say /// so. The return values are: /// /// NoRelocation: This constant pool entry is guaranteed to never have a /// relocation applied to it (because it holds a simple constant like /// '4'). /// LocalRelocation: This entry has relocations, but the entries are /// guaranteed to be resolvable by the static linker, so the dynamic /// linker will never see them. /// GlobalRelocations: This entry may have arbitrary relocations. /// /// FIXME: This really should not be in IR. Constant::PossibleRelocationsTy Constant::getRelocationInfo() const { if (const GlobalValue *GV = dyn_cast(this)) { if (GV->hasLocalLinkage() || GV->hasHiddenVisibility()) return LocalRelocation; // Local to this file/library. return GlobalRelocations; // Global reference. } if (const BlockAddress *BA = dyn_cast(this)) return BA->getFunction()->getRelocationInfo(); // While raw uses of blockaddress need to be relocated, differences between // two of them don't when they are for labels in the same function. This is a // common idiom when creating a table for the indirect goto extension, so we // handle it efficiently here. if (const ConstantExpr *CE = dyn_cast(this)) if (CE->getOpcode() == Instruction::Sub) { ConstantExpr *LHS = dyn_cast(CE->getOperand(0)); ConstantExpr *RHS = dyn_cast(CE->getOperand(1)); if (LHS && RHS && LHS->getOpcode() == Instruction::PtrToInt && RHS->getOpcode() == Instruction::PtrToInt && isa(LHS->getOperand(0)) && isa(RHS->getOperand(0)) && cast(LHS->getOperand(0))->getFunction() == cast(RHS->getOperand(0))->getFunction()) return NoRelocation; } PossibleRelocationsTy Result = NoRelocation; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) Result = std::max(Result, cast(getOperand(i))->getRelocationInfo()); return Result; } /// removeDeadUsersOfConstant - If the specified constantexpr is dead, remove /// it. This involves recursively eliminating any dead users of the /// constantexpr. static bool removeDeadUsersOfConstant(const Constant *C) { if (isa(C)) return false; // Cannot remove this while (!C->use_empty()) { const Constant *User = dyn_cast(C->user_back()); if (!User) return false; // Non-constant usage; if (!removeDeadUsersOfConstant(User)) return false; // Constant wasn't dead } const_cast(C)->destroyConstant(); return true; } /// removeDeadConstantUsers - If there are any dead constant users dangling /// off of this constant, remove them. This method is useful for clients /// that want to check to see if a global is unused, but don't want to deal /// with potentially dead constants hanging off of the globals. void Constant::removeDeadConstantUsers() const { Value::const_user_iterator I = user_begin(), E = user_end(); Value::const_user_iterator LastNonDeadUser = E; while (I != E) { const Constant *User = dyn_cast(*I); if (!User) { LastNonDeadUser = I; ++I; continue; } if (!removeDeadUsersOfConstant(User)) { // If the constant wasn't dead, remember that this was the last live use // and move on to the next constant. LastNonDeadUser = I; ++I; continue; } // If the constant was dead, then the iterator is invalidated. if (LastNonDeadUser == E) { I = user_begin(); if (I == E) break; } else { I = LastNonDeadUser; ++I; } } } //===----------------------------------------------------------------------===// // ConstantInt //===----------------------------------------------------------------------===// void ConstantInt::anchor() { } ConstantInt::ConstantInt(IntegerType *Ty, const APInt& V) : Constant(Ty, ConstantIntVal, nullptr, 0), Val(V) { assert(V.getBitWidth() == Ty->getBitWidth() && "Invalid constant for type"); } ConstantInt *ConstantInt::getTrue(LLVMContext &Context) { LLVMContextImpl *pImpl = Context.pImpl; if (!pImpl->TheTrueVal) pImpl->TheTrueVal = ConstantInt::get(Type::getInt1Ty(Context), 1); return pImpl->TheTrueVal; } ConstantInt *ConstantInt::getFalse(LLVMContext &Context) { LLVMContextImpl *pImpl = Context.pImpl; if (!pImpl->TheFalseVal) pImpl->TheFalseVal = ConstantInt::get(Type::getInt1Ty(Context), 0); return pImpl->TheFalseVal; } Constant *ConstantInt::getTrue(Type *Ty) { VectorType *VTy = dyn_cast(Ty); if (!VTy) { assert(Ty->isIntegerTy(1) && "True must be i1 or vector of i1."); return ConstantInt::getTrue(Ty->getContext()); } assert(VTy->getElementType()->isIntegerTy(1) && "True must be vector of i1 or i1."); return ConstantVector::getSplat(VTy->getNumElements(), ConstantInt::getTrue(Ty->getContext())); } Constant *ConstantInt::getFalse(Type *Ty) { VectorType *VTy = dyn_cast(Ty); if (!VTy) { assert(Ty->isIntegerTy(1) && "False must be i1 or vector of i1."); return ConstantInt::getFalse(Ty->getContext()); } assert(VTy->getElementType()->isIntegerTy(1) && "False must be vector of i1 or i1."); return ConstantVector::getSplat(VTy->getNumElements(), ConstantInt::getFalse(Ty->getContext())); } // Get a ConstantInt from an APInt. ConstantInt *ConstantInt::get(LLVMContext &Context, const APInt &V) { // get an existing value or the insertion position LLVMContextImpl *pImpl = Context.pImpl; ConstantInt *&Slot = pImpl->IntConstants[V]; if (!Slot) { // Get the corresponding integer type for the bit width of the value. IntegerType *ITy = IntegerType::get(Context, V.getBitWidth()); Slot = new ConstantInt(ITy, V); } assert(Slot->getType() == IntegerType::get(Context, V.getBitWidth())); return Slot; } Constant *ConstantInt::get(Type *Ty, uint64_t V, bool isSigned) { Constant *C = get(cast(Ty->getScalarType()), V, isSigned); // For vectors, broadcast the value. if (VectorType *VTy = dyn_cast(Ty)) return ConstantVector::getSplat(VTy->getNumElements(), C); return C; } ConstantInt *ConstantInt::get(IntegerType *Ty, uint64_t V, bool isSigned) { return get(Ty->getContext(), APInt(Ty->getBitWidth(), V, isSigned)); } ConstantInt *ConstantInt::getSigned(IntegerType *Ty, int64_t V) { return get(Ty, V, true); } Constant *ConstantInt::getSigned(Type *Ty, int64_t V) { return get(Ty, V, true); } Constant *ConstantInt::get(Type *Ty, const APInt& V) { ConstantInt *C = get(Ty->getContext(), V); assert(C->getType() == Ty->getScalarType() && "ConstantInt type doesn't match the type implied by its value!"); // For vectors, broadcast the value. if (VectorType *VTy = dyn_cast(Ty)) return ConstantVector::getSplat(VTy->getNumElements(), C); return C; } ConstantInt *ConstantInt::get(IntegerType* Ty, StringRef Str, uint8_t radix) { return get(Ty->getContext(), APInt(Ty->getBitWidth(), Str, radix)); } //===----------------------------------------------------------------------===// // ConstantFP //===----------------------------------------------------------------------===// static const fltSemantics *TypeToFloatSemantics(Type *Ty) { if (Ty->isHalfTy()) return &APFloat::IEEEhalf; if (Ty->isFloatTy()) return &APFloat::IEEEsingle; if (Ty->isDoubleTy()) return &APFloat::IEEEdouble; if (Ty->isX86_FP80Ty()) return &APFloat::x87DoubleExtended; else if (Ty->isFP128Ty()) return &APFloat::IEEEquad; assert(Ty->isPPC_FP128Ty() && "Unknown FP format"); return &APFloat::PPCDoubleDouble; } void ConstantFP::anchor() { } /// get() - This returns a constant fp for the specified value in the /// specified type. This should only be used for simple constant values like /// 2.0/1.0 etc, that are known-valid both as double and as the target format. Constant *ConstantFP::get(Type *Ty, double V) { LLVMContext &Context = Ty->getContext(); APFloat FV(V); bool ignored; FV.convert(*TypeToFloatSemantics(Ty->getScalarType()), APFloat::rmNearestTiesToEven, &ignored); Constant *C = get(Context, FV); // For vectors, broadcast the value. if (VectorType *VTy = dyn_cast(Ty)) return ConstantVector::getSplat(VTy->getNumElements(), C); return C; } Constant *ConstantFP::get(Type *Ty, StringRef Str) { LLVMContext &Context = Ty->getContext(); APFloat FV(*TypeToFloatSemantics(Ty->getScalarType()), Str); Constant *C = get(Context, FV); // For vectors, broadcast the value. if (VectorType *VTy = dyn_cast(Ty)) return ConstantVector::getSplat(VTy->getNumElements(), C); return C; } Constant *ConstantFP::getNegativeZero(Type *Ty) { const fltSemantics &Semantics = *TypeToFloatSemantics(Ty->getScalarType()); APFloat NegZero = APFloat::getZero(Semantics, /*Negative=*/true); Constant *C = get(Ty->getContext(), NegZero); if (VectorType *VTy = dyn_cast(Ty)) return ConstantVector::getSplat(VTy->getNumElements(), C); return C; } Constant *ConstantFP::getZeroValueForNegation(Type *Ty) { if (Ty->isFPOrFPVectorTy()) return getNegativeZero(Ty); return Constant::getNullValue(Ty); } // ConstantFP accessors. ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) { LLVMContextImpl* pImpl = Context.pImpl; ConstantFP *&Slot = pImpl->FPConstants[V]; if (!Slot) { Type *Ty; if (&V.getSemantics() == &APFloat::IEEEhalf) Ty = Type::getHalfTy(Context); else if (&V.getSemantics() == &APFloat::IEEEsingle) Ty = Type::getFloatTy(Context); else if (&V.getSemantics() == &APFloat::IEEEdouble) Ty = Type::getDoubleTy(Context); else if (&V.getSemantics() == &APFloat::x87DoubleExtended) Ty = Type::getX86_FP80Ty(Context); else if (&V.getSemantics() == &APFloat::IEEEquad) Ty = Type::getFP128Ty(Context); else { assert(&V.getSemantics() == &APFloat::PPCDoubleDouble && "Unknown FP format"); Ty = Type::getPPC_FP128Ty(Context); } Slot = new ConstantFP(Ty, V); } return Slot; } Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) { const fltSemantics &Semantics = *TypeToFloatSemantics(Ty->getScalarType()); Constant *C = get(Ty->getContext(), APFloat::getInf(Semantics, Negative)); if (VectorType *VTy = dyn_cast(Ty)) return ConstantVector::getSplat(VTy->getNumElements(), C); return C; } ConstantFP::ConstantFP(Type *Ty, const APFloat& V) : Constant(Ty, ConstantFPVal, nullptr, 0), Val(V) { assert(&V.getSemantics() == TypeToFloatSemantics(Ty) && "FP type Mismatch"); } bool ConstantFP::isExactlyValue(const APFloat &V) const { return Val.bitwiseIsEqual(V); } //===----------------------------------------------------------------------===// // ConstantAggregateZero Implementation //===----------------------------------------------------------------------===// /// getSequentialElement - If this CAZ has array or vector type, return a zero /// with the right element type. Constant *ConstantAggregateZero::getSequentialElement() const { return Constant::getNullValue(getType()->getSequentialElementType()); } /// getStructElement - If this CAZ has struct type, return a zero with the /// right element type for the specified element. Constant *ConstantAggregateZero::getStructElement(unsigned Elt) const { return Constant::getNullValue(getType()->getStructElementType(Elt)); } /// getElementValue - Return a zero of the right value for the specified GEP /// index if we can, otherwise return null (e.g. if C is a ConstantExpr). Constant *ConstantAggregateZero::getElementValue(Constant *C) const { if (isa(getType())) return getSequentialElement(); return getStructElement(cast(C)->getZExtValue()); } /// getElementValue - Return a zero of the right value for the specified GEP /// index. Constant *ConstantAggregateZero::getElementValue(unsigned Idx) const { if (isa(getType())) return getSequentialElement(); return getStructElement(Idx); } +unsigned ConstantAggregateZero::getNumElements() const { + const Type *Ty = getType(); + if (const auto *AT = dyn_cast(Ty)) + return AT->getNumElements(); + if (const auto *VT = dyn_cast(Ty)) + return VT->getNumElements(); + return Ty->getStructNumElements(); +} //===----------------------------------------------------------------------===// // UndefValue Implementation //===----------------------------------------------------------------------===// /// getSequentialElement - If this undef has array or vector type, return an /// undef with the right element type. UndefValue *UndefValue::getSequentialElement() const { return UndefValue::get(getType()->getSequentialElementType()); } /// getStructElement - If this undef has struct type, return a zero with the /// right element type for the specified element. UndefValue *UndefValue::getStructElement(unsigned Elt) const { return UndefValue::get(getType()->getStructElementType(Elt)); } /// getElementValue - Return an undef of the right value for the specified GEP /// index if we can, otherwise return null (e.g. if C is a ConstantExpr). UndefValue *UndefValue::getElementValue(Constant *C) const { if (isa(getType())) return getSequentialElement(); return getStructElement(cast(C)->getZExtValue()); } /// getElementValue - Return an undef of the right value for the specified GEP /// index. UndefValue *UndefValue::getElementValue(unsigned Idx) const { if (isa(getType())) return getSequentialElement(); return getStructElement(Idx); } - +unsigned UndefValue::getNumElements() const { + const Type *Ty = getType(); + if (const auto *AT = dyn_cast(Ty)) + return AT->getNumElements(); + if (const auto *VT = dyn_cast(Ty)) + return VT->getNumElements(); + return Ty->getStructNumElements(); +} //===----------------------------------------------------------------------===// // ConstantXXX Classes //===----------------------------------------------------------------------===// template static bool rangeOnlyContains(ItTy Start, ItTy End, EltTy Elt) { for (; Start != End; ++Start) if (*Start != Elt) return false; return true; } ConstantArray::ConstantArray(ArrayType *T, ArrayRef V) : Constant(T, ConstantArrayVal, OperandTraits::op_end(this) - V.size(), V.size()) { assert(V.size() == T->getNumElements() && "Invalid initializer vector for constant array"); for (unsigned i = 0, e = V.size(); i != e; ++i) assert(V[i]->getType() == T->getElementType() && "Initializer for array element doesn't match array element type!"); std::copy(V.begin(), V.end(), op_begin()); } Constant *ConstantArray::get(ArrayType *Ty, ArrayRef V) { if (Constant *C = getImpl(Ty, V)) return C; return Ty->getContext().pImpl->ArrayConstants.getOrCreate(Ty, V); } Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef V) { // Empty arrays are canonicalized to ConstantAggregateZero. if (V.empty()) return ConstantAggregateZero::get(Ty); for (unsigned i = 0, e = V.size(); i != e; ++i) { assert(V[i]->getType() == Ty->getElementType() && "Wrong type in array element initializer"); } // If this is an all-zero array, return a ConstantAggregateZero object. If // all undef, return an UndefValue, if "all simple", then return a // ConstantDataArray. Constant *C = V[0]; if (isa(C) && rangeOnlyContains(V.begin(), V.end(), C)) return UndefValue::get(Ty); if (C->isNullValue() && rangeOnlyContains(V.begin(), V.end(), C)) return ConstantAggregateZero::get(Ty); // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. if (ConstantDataSequential::isElementTypeCompatible(C->getType())) { // We speculatively build the elements here even if it turns out that there // is a constantexpr or something else weird in the array, since it is so // uncommon for that to happen. if (ConstantInt *CI = dyn_cast(C)) { if (CI->getType()->isIntegerTy(8)) { SmallVector Elts; for (unsigned i = 0, e = V.size(); i != e; ++i) if (ConstantInt *CI = dyn_cast(V[i])) Elts.push_back(CI->getZExtValue()); else break; if (Elts.size() == V.size()) return ConstantDataArray::get(C->getContext(), Elts); } else if (CI->getType()->isIntegerTy(16)) { SmallVector Elts; for (unsigned i = 0, e = V.size(); i != e; ++i) if (ConstantInt *CI = dyn_cast(V[i])) Elts.push_back(CI->getZExtValue()); else break; if (Elts.size() == V.size()) return ConstantDataArray::get(C->getContext(), Elts); } else if (CI->getType()->isIntegerTy(32)) { SmallVector Elts; for (unsigned i = 0, e = V.size(); i != e; ++i) if (ConstantInt *CI = dyn_cast(V[i])) Elts.push_back(CI->getZExtValue()); else break; if (Elts.size() == V.size()) return ConstantDataArray::get(C->getContext(), Elts); } else if (CI->getType()->isIntegerTy(64)) { SmallVector Elts; for (unsigned i = 0, e = V.size(); i != e; ++i) if (ConstantInt *CI = dyn_cast(V[i])) Elts.push_back(CI->getZExtValue()); else break; if (Elts.size() == V.size()) return ConstantDataArray::get(C->getContext(), Elts); } } if (ConstantFP *CFP = dyn_cast(C)) { if (CFP->getType()->isFloatTy()) { SmallVector Elts; for (unsigned i = 0, e = V.size(); i != e; ++i) if (ConstantFP *CFP = dyn_cast(V[i])) Elts.push_back(CFP->getValueAPF().convertToFloat()); else break; if (Elts.size() == V.size()) return ConstantDataArray::get(C->getContext(), Elts); } else if (CFP->getType()->isDoubleTy()) { SmallVector Elts; for (unsigned i = 0, e = V.size(); i != e; ++i) if (ConstantFP *CFP = dyn_cast(V[i])) Elts.push_back(CFP->getValueAPF().convertToDouble()); else break; if (Elts.size() == V.size()) return ConstantDataArray::get(C->getContext(), Elts); } } } // Otherwise, we really do want to create a ConstantArray. return nullptr; } /// getTypeForElements - Return an anonymous struct type to use for a constant /// with the specified set of elements. The list must not be empty. StructType *ConstantStruct::getTypeForElements(LLVMContext &Context, ArrayRef V, bool Packed) { unsigned VecSize = V.size(); SmallVector EltTypes(VecSize); for (unsigned i = 0; i != VecSize; ++i) EltTypes[i] = V[i]->getType(); return StructType::get(Context, EltTypes, Packed); } StructType *ConstantStruct::getTypeForElements(ArrayRef V, bool Packed) { assert(!V.empty() && "ConstantStruct::getTypeForElements cannot be called on empty list"); return getTypeForElements(V[0]->getContext(), V, Packed); } ConstantStruct::ConstantStruct(StructType *T, ArrayRef V) : Constant(T, ConstantStructVal, OperandTraits::op_end(this) - V.size(), V.size()) { assert(V.size() == T->getNumElements() && "Invalid initializer vector for constant structure"); for (unsigned i = 0, e = V.size(); i != e; ++i) assert((T->isOpaque() || V[i]->getType() == T->getElementType(i)) && "Initializer for struct element doesn't match struct element type!"); std::copy(V.begin(), V.end(), op_begin()); } // ConstantStruct accessors. Constant *ConstantStruct::get(StructType *ST, ArrayRef V) { assert((ST->isOpaque() || ST->getNumElements() == V.size()) && "Incorrect # elements specified to ConstantStruct::get"); // Create a ConstantAggregateZero value if all elements are zeros. bool isZero = true; bool isUndef = false; if (!V.empty()) { isUndef = isa(V[0]); isZero = V[0]->isNullValue(); if (isUndef || isZero) { for (unsigned i = 0, e = V.size(); i != e; ++i) { if (!V[i]->isNullValue()) isZero = false; if (!isa(V[i])) isUndef = false; } } } if (isZero) return ConstantAggregateZero::get(ST); if (isUndef) return UndefValue::get(ST); return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V); } Constant *ConstantStruct::get(StructType *T, ...) { va_list ap; SmallVector Values; va_start(ap, T); while (Constant *Val = va_arg(ap, llvm::Constant*)) Values.push_back(Val); va_end(ap); return get(T, Values); } ConstantVector::ConstantVector(VectorType *T, ArrayRef V) : Constant(T, ConstantVectorVal, OperandTraits::op_end(this) - V.size(), V.size()) { for (size_t i = 0, e = V.size(); i != e; i++) assert(V[i]->getType() == T->getElementType() && "Initializer for vector element doesn't match vector element type!"); std::copy(V.begin(), V.end(), op_begin()); } // ConstantVector accessors. Constant *ConstantVector::get(ArrayRef V) { if (Constant *C = getImpl(V)) return C; VectorType *Ty = VectorType::get(V.front()->getType(), V.size()); return Ty->getContext().pImpl->VectorConstants.getOrCreate(Ty, V); } Constant *ConstantVector::getImpl(ArrayRef V) { assert(!V.empty() && "Vectors can't be empty"); VectorType *T = VectorType::get(V.front()->getType(), V.size()); // If this is an all-undef or all-zero vector, return a // ConstantAggregateZero or UndefValue. Constant *C = V[0]; bool isZero = C->isNullValue(); bool isUndef = isa(C); if (isZero || isUndef) { for (unsigned i = 1, e = V.size(); i != e; ++i) if (V[i] != C) { isZero = isUndef = false; break; } } if (isZero) return ConstantAggregateZero::get(T); if (isUndef) return UndefValue::get(T); // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. if (ConstantDataSequential::isElementTypeCompatible(C->getType())) { // We speculatively build the elements here even if it turns out that there // is a constantexpr or something else weird in the array, since it is so // uncommon for that to happen. if (ConstantInt *CI = dyn_cast(C)) { if (CI->getType()->isIntegerTy(8)) { SmallVector Elts; for (unsigned i = 0, e = V.size(); i != e; ++i) if (ConstantInt *CI = dyn_cast(V[i])) Elts.push_back(CI->getZExtValue()); else break; if (Elts.size() == V.size()) return ConstantDataVector::get(C->getContext(), Elts); } else if (CI->getType()->isIntegerTy(16)) { SmallVector Elts; for (unsigned i = 0, e = V.size(); i != e; ++i) if (ConstantInt *CI = dyn_cast(V[i])) Elts.push_back(CI->getZExtValue()); else break; if (Elts.size() == V.size()) return ConstantDataVector::get(C->getContext(), Elts); } else if (CI->getType()->isIntegerTy(32)) { SmallVector Elts; for (unsigned i = 0, e = V.size(); i != e; ++i) if (ConstantInt *CI = dyn_cast(V[i])) Elts.push_back(CI->getZExtValue()); else break; if (Elts.size() == V.size()) return ConstantDataVector::get(C->getContext(), Elts); } else if (CI->getType()->isIntegerTy(64)) { SmallVector Elts; for (unsigned i = 0, e = V.size(); i != e; ++i) if (ConstantInt *CI = dyn_cast(V[i])) Elts.push_back(CI->getZExtValue()); else break; if (Elts.size() == V.size()) return ConstantDataVector::get(C->getContext(), Elts); } } if (ConstantFP *CFP = dyn_cast(C)) { if (CFP->getType()->isFloatTy()) { SmallVector Elts; for (unsigned i = 0, e = V.size(); i != e; ++i) if (ConstantFP *CFP = dyn_cast(V[i])) Elts.push_back(CFP->getValueAPF().convertToFloat()); else break; if (Elts.size() == V.size()) return ConstantDataVector::get(C->getContext(), Elts); } else if (CFP->getType()->isDoubleTy()) { SmallVector Elts; for (unsigned i = 0, e = V.size(); i != e; ++i) if (ConstantFP *CFP = dyn_cast(V[i])) Elts.push_back(CFP->getValueAPF().convertToDouble()); else break; if (Elts.size() == V.size()) return ConstantDataVector::get(C->getContext(), Elts); } } } // Otherwise, the element type isn't compatible with ConstantDataVector, or // the operand list constants a ConstantExpr or something else strange. return nullptr; } Constant *ConstantVector::getSplat(unsigned NumElts, Constant *V) { // If this splat is compatible with ConstantDataVector, use it instead of // ConstantVector. if ((isa(V) || isa(V)) && ConstantDataSequential::isElementTypeCompatible(V->getType())) return ConstantDataVector::getSplat(NumElts, V); SmallVector Elts(NumElts, V); return get(Elts); } // Utility function for determining if a ConstantExpr is a CastOp or not. This // can't be inline because we don't want to #include Instruction.h into // Constant.h bool ConstantExpr::isCast() const { return Instruction::isCast(getOpcode()); } bool ConstantExpr::isCompare() const { return getOpcode() == Instruction::ICmp || getOpcode() == Instruction::FCmp; } bool ConstantExpr::isGEPWithNoNotionalOverIndexing() const { if (getOpcode() != Instruction::GetElementPtr) return false; gep_type_iterator GEPI = gep_type_begin(this), E = gep_type_end(this); User::const_op_iterator OI = std::next(this->op_begin()); // Skip the first index, as it has no static limit. ++GEPI; ++OI; // The remaining indices must be compile-time known integers within the // bounds of the corresponding notional static array types. for (; GEPI != E; ++GEPI, ++OI) { ConstantInt *CI = dyn_cast(*OI); if (!CI) return false; if (ArrayType *ATy = dyn_cast(*GEPI)) if (CI->getValue().getActiveBits() > 64 || CI->getZExtValue() >= ATy->getNumElements()) return false; } // All the indices checked out. return true; } bool ConstantExpr::hasIndices() const { return getOpcode() == Instruction::ExtractValue || getOpcode() == Instruction::InsertValue; } ArrayRef ConstantExpr::getIndices() const { if (const ExtractValueConstantExpr *EVCE = dyn_cast(this)) return EVCE->Indices; return cast(this)->Indices; } unsigned ConstantExpr::getPredicate() const { assert(isCompare()); return ((const CompareConstantExpr*)this)->predicate; } /// getWithOperandReplaced - Return a constant expression identical to this /// one, but with the specified operand set to the specified value. Constant * ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const { assert(Op->getType() == getOperand(OpNo)->getType() && "Replacing operand with value of different type!"); if (getOperand(OpNo) == Op) return const_cast(this); SmallVector NewOps; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) NewOps.push_back(i == OpNo ? Op : getOperand(i)); return getWithOperands(NewOps); } /// getWithOperands - This returns the current constant expression with the /// operands replaced with the specified values. The specified array must /// have the same number of operands as our current one. Constant *ConstantExpr::getWithOperands(ArrayRef Ops, Type *Ty, bool OnlyIfReduced) const { assert(Ops.size() == getNumOperands() && "Operand count mismatch!"); bool AnyChange = Ty != getType(); for (unsigned i = 0; i != Ops.size(); ++i) AnyChange |= Ops[i] != getOperand(i); if (!AnyChange) // No operands changed, return self. return const_cast(this); Type *OnlyIfReducedTy = OnlyIfReduced ? Ty : nullptr; switch (getOpcode()) { case Instruction::Trunc: case Instruction::ZExt: case Instruction::SExt: case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::UIToFP: case Instruction::SIToFP: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: case Instruction::AddrSpaceCast: return ConstantExpr::getCast(getOpcode(), Ops[0], Ty, OnlyIfReduced); case Instruction::Select: return ConstantExpr::getSelect(Ops[0], Ops[1], Ops[2], OnlyIfReducedTy); case Instruction::InsertElement: return ConstantExpr::getInsertElement(Ops[0], Ops[1], Ops[2], OnlyIfReducedTy); case Instruction::ExtractElement: return ConstantExpr::getExtractElement(Ops[0], Ops[1], OnlyIfReducedTy); case Instruction::InsertValue: return ConstantExpr::getInsertValue(Ops[0], Ops[1], getIndices(), OnlyIfReducedTy); case Instruction::ExtractValue: return ConstantExpr::getExtractValue(Ops[0], getIndices(), OnlyIfReducedTy); case Instruction::ShuffleVector: return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2], OnlyIfReducedTy); case Instruction::GetElementPtr: return ConstantExpr::getGetElementPtr(Ops[0], Ops.slice(1), cast(this)->isInBounds(), OnlyIfReducedTy); case Instruction::ICmp: case Instruction::FCmp: return ConstantExpr::getCompare(getPredicate(), Ops[0], Ops[1], OnlyIfReducedTy); default: assert(getNumOperands() == 2 && "Must be binary operator?"); return ConstantExpr::get(getOpcode(), Ops[0], Ops[1], SubclassOptionalData, OnlyIfReducedTy); } } //===----------------------------------------------------------------------===// // isValueValidForType implementations bool ConstantInt::isValueValidForType(Type *Ty, uint64_t Val) { unsigned NumBits = Ty->getIntegerBitWidth(); // assert okay if (Ty->isIntegerTy(1)) return Val == 0 || Val == 1; if (NumBits >= 64) return true; // always true, has to fit in largest type uint64_t Max = (1ll << NumBits) - 1; return Val <= Max; } bool ConstantInt::isValueValidForType(Type *Ty, int64_t Val) { unsigned NumBits = Ty->getIntegerBitWidth(); if (Ty->isIntegerTy(1)) return Val == 0 || Val == 1 || Val == -1; if (NumBits >= 64) return true; // always true, has to fit in largest type int64_t Min = -(1ll << (NumBits-1)); int64_t Max = (1ll << (NumBits-1)) - 1; return (Val >= Min && Val <= Max); } bool ConstantFP::isValueValidForType(Type *Ty, const APFloat& Val) { // convert modifies in place, so make a copy. APFloat Val2 = APFloat(Val); bool losesInfo; switch (Ty->getTypeID()) { default: return false; // These can't be represented as floating point! // FIXME rounding mode needs to be more flexible case Type::HalfTyID: { if (&Val2.getSemantics() == &APFloat::IEEEhalf) return true; Val2.convert(APFloat::IEEEhalf, APFloat::rmNearestTiesToEven, &losesInfo); return !losesInfo; } case Type::FloatTyID: { if (&Val2.getSemantics() == &APFloat::IEEEsingle) return true; Val2.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &losesInfo); return !losesInfo; } case Type::DoubleTyID: { if (&Val2.getSemantics() == &APFloat::IEEEhalf || &Val2.getSemantics() == &APFloat::IEEEsingle || &Val2.getSemantics() == &APFloat::IEEEdouble) return true; Val2.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &losesInfo); return !losesInfo; } case Type::X86_FP80TyID: return &Val2.getSemantics() == &APFloat::IEEEhalf || &Val2.getSemantics() == &APFloat::IEEEsingle || &Val2.getSemantics() == &APFloat::IEEEdouble || &Val2.getSemantics() == &APFloat::x87DoubleExtended; case Type::FP128TyID: return &Val2.getSemantics() == &APFloat::IEEEhalf || &Val2.getSemantics() == &APFloat::IEEEsingle || &Val2.getSemantics() == &APFloat::IEEEdouble || &Val2.getSemantics() == &APFloat::IEEEquad; case Type::PPC_FP128TyID: return &Val2.getSemantics() == &APFloat::IEEEhalf || &Val2.getSemantics() == &APFloat::IEEEsingle || &Val2.getSemantics() == &APFloat::IEEEdouble || &Val2.getSemantics() == &APFloat::PPCDoubleDouble; } } //===----------------------------------------------------------------------===// // Factory Function Implementation ConstantAggregateZero *ConstantAggregateZero::get(Type *Ty) { assert((Ty->isStructTy() || Ty->isArrayTy() || Ty->isVectorTy()) && "Cannot create an aggregate zero of non-aggregate type!"); ConstantAggregateZero *&Entry = Ty->getContext().pImpl->CAZConstants[Ty]; if (!Entry) Entry = new ConstantAggregateZero(Ty); return Entry; } /// destroyConstant - Remove the constant from the constant table. /// void ConstantAggregateZero::destroyConstant() { getContext().pImpl->CAZConstants.erase(getType()); destroyConstantImpl(); } /// destroyConstant - Remove the constant from the constant table... /// void ConstantArray::destroyConstant() { getType()->getContext().pImpl->ArrayConstants.remove(this); destroyConstantImpl(); } //---- ConstantStruct::get() implementation... // // destroyConstant - Remove the constant from the constant table... // void ConstantStruct::destroyConstant() { getType()->getContext().pImpl->StructConstants.remove(this); destroyConstantImpl(); } // destroyConstant - Remove the constant from the constant table... // void ConstantVector::destroyConstant() { getType()->getContext().pImpl->VectorConstants.remove(this); destroyConstantImpl(); } /// getSplatValue - If this is a splat vector constant, meaning that all of /// the elements have the same value, return that value. Otherwise return 0. Constant *Constant::getSplatValue() const { assert(this->getType()->isVectorTy() && "Only valid for vectors!"); if (isa(this)) return getNullValue(this->getType()->getVectorElementType()); if (const ConstantDataVector *CV = dyn_cast(this)) return CV->getSplatValue(); if (const ConstantVector *CV = dyn_cast(this)) return CV->getSplatValue(); return nullptr; } /// getSplatValue - If this is a splat constant, where all of the /// elements have the same value, return that value. Otherwise return null. Constant *ConstantVector::getSplatValue() const { // Check out first element. Constant *Elt = getOperand(0); // Then make sure all remaining elements point to the same value. for (unsigned I = 1, E = getNumOperands(); I < E; ++I) if (getOperand(I) != Elt) return nullptr; return Elt; } /// If C is a constant integer then return its value, otherwise C must be a /// vector of constant integers, all equal, and the common value is returned. const APInt &Constant::getUniqueInteger() const { if (const ConstantInt *CI = dyn_cast(this)) return CI->getValue(); assert(this->getSplatValue() && "Doesn't contain a unique integer!"); const Constant *C = this->getAggregateElement(0U); assert(C && isa(C) && "Not a vector of numbers!"); return cast(C)->getValue(); } //---- ConstantPointerNull::get() implementation. // ConstantPointerNull *ConstantPointerNull::get(PointerType *Ty) { ConstantPointerNull *&Entry = Ty->getContext().pImpl->CPNConstants[Ty]; if (!Entry) Entry = new ConstantPointerNull(Ty); return Entry; } // destroyConstant - Remove the constant from the constant table... // void ConstantPointerNull::destroyConstant() { getContext().pImpl->CPNConstants.erase(getType()); // Free the constant and any dangling references to it. destroyConstantImpl(); } //---- UndefValue::get() implementation. // UndefValue *UndefValue::get(Type *Ty) { UndefValue *&Entry = Ty->getContext().pImpl->UVConstants[Ty]; if (!Entry) Entry = new UndefValue(Ty); return Entry; } // destroyConstant - Remove the constant from the constant table. // void UndefValue::destroyConstant() { // Free the constant and any dangling references to it. getContext().pImpl->UVConstants.erase(getType()); destroyConstantImpl(); } //---- BlockAddress::get() implementation. // BlockAddress *BlockAddress::get(BasicBlock *BB) { assert(BB->getParent() && "Block must have a parent"); return get(BB->getParent(), BB); } BlockAddress *BlockAddress::get(Function *F, BasicBlock *BB) { BlockAddress *&BA = F->getContext().pImpl->BlockAddresses[std::make_pair(F, BB)]; if (!BA) BA = new BlockAddress(F, BB); assert(BA->getFunction() == F && "Basic block moved between functions"); return BA; } BlockAddress::BlockAddress(Function *F, BasicBlock *BB) : Constant(Type::getInt8PtrTy(F->getContext()), Value::BlockAddressVal, &Op<0>(), 2) { setOperand(0, F); setOperand(1, BB); BB->AdjustBlockAddressRefCount(1); } BlockAddress *BlockAddress::lookup(const BasicBlock *BB) { if (!BB->hasAddressTaken()) return nullptr; const Function *F = BB->getParent(); assert(F && "Block must have a parent"); BlockAddress *BA = F->getContext().pImpl->BlockAddresses.lookup(std::make_pair(F, BB)); assert(BA && "Refcount and block address map disagree!"); return BA; } // destroyConstant - Remove the constant from the constant table. // void BlockAddress::destroyConstant() { getFunction()->getType()->getContext().pImpl ->BlockAddresses.erase(std::make_pair(getFunction(), getBasicBlock())); getBasicBlock()->AdjustBlockAddressRefCount(-1); destroyConstantImpl(); } void BlockAddress::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) { // This could be replacing either the Basic Block or the Function. In either // case, we have to remove the map entry. Function *NewF = getFunction(); BasicBlock *NewBB = getBasicBlock(); if (U == &Op<0>()) NewF = cast(To->stripPointerCasts()); else NewBB = cast(To); // See if the 'new' entry already exists, if not, just update this in place // and return early. BlockAddress *&NewBA = getContext().pImpl->BlockAddresses[std::make_pair(NewF, NewBB)]; if (NewBA) { replaceUsesOfWithOnConstantImpl(NewBA); return; } getBasicBlock()->AdjustBlockAddressRefCount(-1); // Remove the old entry, this can't cause the map to rehash (just a // tombstone will get added). getContext().pImpl->BlockAddresses.erase(std::make_pair(getFunction(), getBasicBlock())); NewBA = this; setOperand(0, NewF); setOperand(1, NewBB); getBasicBlock()->AdjustBlockAddressRefCount(1); } //---- ConstantExpr::get() implementations. // /// This is a utility function to handle folding of casts and lookup of the /// cast in the ExprConstants map. It is used by the various get* methods below. static Constant *getFoldedCast(Instruction::CastOps opc, Constant *C, Type *Ty, bool OnlyIfReduced = false) { assert(Ty->isFirstClassType() && "Cannot cast to an aggregate type!"); // Fold a few common cases if (Constant *FC = ConstantFoldCastInstruction(opc, C, Ty)) return FC; if (OnlyIfReduced) return nullptr; LLVMContextImpl *pImpl = Ty->getContext().pImpl; // Look up the constant in the table first to ensure uniqueness. ConstantExprKeyType Key(opc, C); return pImpl->ExprConstants.getOrCreate(Ty, Key); } Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty, bool OnlyIfReduced) { Instruction::CastOps opc = Instruction::CastOps(oc); assert(Instruction::isCast(opc) && "opcode out of range"); assert(C && Ty && "Null arguments to getCast"); assert(CastInst::castIsValid(opc, C, Ty) && "Invalid constantexpr cast!"); switch (opc) { default: llvm_unreachable("Invalid cast opcode"); case Instruction::Trunc: return getTrunc(C, Ty, OnlyIfReduced); case Instruction::ZExt: return getZExt(C, Ty, OnlyIfReduced); case Instruction::SExt: return getSExt(C, Ty, OnlyIfReduced); case Instruction::FPTrunc: return getFPTrunc(C, Ty, OnlyIfReduced); case Instruction::FPExt: return getFPExtend(C, Ty, OnlyIfReduced); case Instruction::UIToFP: return getUIToFP(C, Ty, OnlyIfReduced); case Instruction::SIToFP: return getSIToFP(C, Ty, OnlyIfReduced); case Instruction::FPToUI: return getFPToUI(C, Ty, OnlyIfReduced); case Instruction::FPToSI: return getFPToSI(C, Ty, OnlyIfReduced); case Instruction::PtrToInt: return getPtrToInt(C, Ty, OnlyIfReduced); case Instruction::IntToPtr: return getIntToPtr(C, Ty, OnlyIfReduced); case Instruction::BitCast: return getBitCast(C, Ty, OnlyIfReduced); case Instruction::AddrSpaceCast: return getAddrSpaceCast(C, Ty, OnlyIfReduced); } } Constant *ConstantExpr::getZExtOrBitCast(Constant *C, Type *Ty) { if (C->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits()) return getBitCast(C, Ty); return getZExt(C, Ty); } Constant *ConstantExpr::getSExtOrBitCast(Constant *C, Type *Ty) { if (C->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits()) return getBitCast(C, Ty); return getSExt(C, Ty); } Constant *ConstantExpr::getTruncOrBitCast(Constant *C, Type *Ty) { if (C->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits()) return getBitCast(C, Ty); return getTrunc(C, Ty); } Constant *ConstantExpr::getPointerCast(Constant *S, Type *Ty) { assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast"); assert((Ty->isIntOrIntVectorTy() || Ty->isPtrOrPtrVectorTy()) && "Invalid cast"); if (Ty->isIntOrIntVectorTy()) return getPtrToInt(S, Ty); unsigned SrcAS = S->getType()->getPointerAddressSpace(); if (Ty->isPtrOrPtrVectorTy() && SrcAS != Ty->getPointerAddressSpace()) return getAddrSpaceCast(S, Ty); return getBitCast(S, Ty); } Constant *ConstantExpr::getPointerBitCastOrAddrSpaceCast(Constant *S, Type *Ty) { assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast"); assert(Ty->isPtrOrPtrVectorTy() && "Invalid cast"); if (S->getType()->getPointerAddressSpace() != Ty->getPointerAddressSpace()) return getAddrSpaceCast(S, Ty); return getBitCast(S, Ty); } Constant *ConstantExpr::getIntegerCast(Constant *C, Type *Ty, bool isSigned) { assert(C->getType()->isIntOrIntVectorTy() && Ty->isIntOrIntVectorTy() && "Invalid cast"); unsigned SrcBits = C->getType()->getScalarSizeInBits(); unsigned DstBits = Ty->getScalarSizeInBits(); Instruction::CastOps opcode = (SrcBits == DstBits ? Instruction::BitCast : (SrcBits > DstBits ? Instruction::Trunc : (isSigned ? Instruction::SExt : Instruction::ZExt))); return getCast(opcode, C, Ty); } Constant *ConstantExpr::getFPCast(Constant *C, Type *Ty) { assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() && "Invalid cast"); unsigned SrcBits = C->getType()->getScalarSizeInBits(); unsigned DstBits = Ty->getScalarSizeInBits(); if (SrcBits == DstBits) return C; // Avoid a useless cast Instruction::CastOps opcode = (SrcBits > DstBits ? Instruction::FPTrunc : Instruction::FPExt); return getCast(opcode, C, Ty); } Constant *ConstantExpr::getTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) { #ifndef NDEBUG bool fromVec = C->getType()->getTypeID() == Type::VectorTyID; bool toVec = Ty->getTypeID() == Type::VectorTyID; #endif assert((fromVec == toVec) && "Cannot convert from scalar to/from vector"); assert(C->getType()->isIntOrIntVectorTy() && "Trunc operand must be integer"); assert(Ty->isIntOrIntVectorTy() && "Trunc produces only integral"); assert(C->getType()->getScalarSizeInBits() > Ty->getScalarSizeInBits()&& "SrcTy must be larger than DestTy for Trunc!"); return getFoldedCast(Instruction::Trunc, C, Ty, OnlyIfReduced); } Constant *ConstantExpr::getSExt(Constant *C, Type *Ty, bool OnlyIfReduced) { #ifndef NDEBUG bool fromVec = C->getType()->getTypeID() == Type::VectorTyID; bool toVec = Ty->getTypeID() == Type::VectorTyID; #endif assert((fromVec == toVec) && "Cannot convert from scalar to/from vector"); assert(C->getType()->isIntOrIntVectorTy() && "SExt operand must be integral"); assert(Ty->isIntOrIntVectorTy() && "SExt produces only integer"); assert(C->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits()&& "SrcTy must be smaller than DestTy for SExt!"); return getFoldedCast(Instruction::SExt, C, Ty, OnlyIfReduced); } Constant *ConstantExpr::getZExt(Constant *C, Type *Ty, bool OnlyIfReduced) { #ifndef NDEBUG bool fromVec = C->getType()->getTypeID() == Type::VectorTyID; bool toVec = Ty->getTypeID() == Type::VectorTyID; #endif assert((fromVec == toVec) && "Cannot convert from scalar to/from vector"); assert(C->getType()->isIntOrIntVectorTy() && "ZEXt operand must be integral"); assert(Ty->isIntOrIntVectorTy() && "ZExt produces only integer"); assert(C->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits()&& "SrcTy must be smaller than DestTy for ZExt!"); return getFoldedCast(Instruction::ZExt, C, Ty, OnlyIfReduced); } Constant *ConstantExpr::getFPTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) { #ifndef NDEBUG bool fromVec = C->getType()->getTypeID() == Type::VectorTyID; bool toVec = Ty->getTypeID() == Type::VectorTyID; #endif assert((fromVec == toVec) && "Cannot convert from scalar to/from vector"); assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() && C->getType()->getScalarSizeInBits() > Ty->getScalarSizeInBits()&& "This is an illegal floating point truncation!"); return getFoldedCast(Instruction::FPTrunc, C, Ty, OnlyIfReduced); } Constant *ConstantExpr::getFPExtend(Constant *C, Type *Ty, bool OnlyIfReduced) { #ifndef NDEBUG bool fromVec = C->getType()->getTypeID() == Type::VectorTyID; bool toVec = Ty->getTypeID() == Type::VectorTyID; #endif assert((fromVec == toVec) && "Cannot convert from scalar to/from vector"); assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() && C->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits()&& "This is an illegal floating point extension!"); return getFoldedCast(Instruction::FPExt, C, Ty, OnlyIfReduced); } Constant *ConstantExpr::getUIToFP(Constant *C, Type *Ty, bool OnlyIfReduced) { #ifndef NDEBUG bool fromVec = C->getType()->getTypeID() == Type::VectorTyID; bool toVec = Ty->getTypeID() == Type::VectorTyID; #endif assert((fromVec == toVec) && "Cannot convert from scalar to/from vector"); assert(C->getType()->isIntOrIntVectorTy() && Ty->isFPOrFPVectorTy() && "This is an illegal uint to floating point cast!"); return getFoldedCast(Instruction::UIToFP, C, Ty, OnlyIfReduced); } Constant *ConstantExpr::getSIToFP(Constant *C, Type *Ty, bool OnlyIfReduced) { #ifndef NDEBUG bool fromVec = C->getType()->getTypeID() == Type::VectorTyID; bool toVec = Ty->getTypeID() == Type::VectorTyID; #endif assert((fromVec == toVec) && "Cannot convert from scalar to/from vector"); assert(C->getType()->isIntOrIntVectorTy() && Ty->isFPOrFPVectorTy() && "This is an illegal sint to floating point cast!"); return getFoldedCast(Instruction::SIToFP, C, Ty, OnlyIfReduced); } Constant *ConstantExpr::getFPToUI(Constant *C, Type *Ty, bool OnlyIfReduced) { #ifndef NDEBUG bool fromVec = C->getType()->getTypeID() == Type::VectorTyID; bool toVec = Ty->getTypeID() == Type::VectorTyID; #endif assert((fromVec == toVec) && "Cannot convert from scalar to/from vector"); assert(C->getType()->isFPOrFPVectorTy() && Ty->isIntOrIntVectorTy() && "This is an illegal floating point to uint cast!"); return getFoldedCast(Instruction::FPToUI, C, Ty, OnlyIfReduced); } Constant *ConstantExpr::getFPToSI(Constant *C, Type *Ty, bool OnlyIfReduced) { #ifndef NDEBUG bool fromVec = C->getType()->getTypeID() == Type::VectorTyID; bool toVec = Ty->getTypeID() == Type::VectorTyID; #endif assert((fromVec == toVec) && "Cannot convert from scalar to/from vector"); assert(C->getType()->isFPOrFPVectorTy() && Ty->isIntOrIntVectorTy() && "This is an illegal floating point to sint cast!"); return getFoldedCast(Instruction::FPToSI, C, Ty, OnlyIfReduced); } Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy, bool OnlyIfReduced) { assert(C->getType()->getScalarType()->isPointerTy() && "PtrToInt source must be pointer or pointer vector"); assert(DstTy->getScalarType()->isIntegerTy() && "PtrToInt destination must be integer or integer vector"); assert(isa(C->getType()) == isa(DstTy)); if (isa(C->getType())) assert(C->getType()->getVectorNumElements()==DstTy->getVectorNumElements()&& "Invalid cast between a different number of vector elements"); return getFoldedCast(Instruction::PtrToInt, C, DstTy, OnlyIfReduced); } Constant *ConstantExpr::getIntToPtr(Constant *C, Type *DstTy, bool OnlyIfReduced) { assert(C->getType()->getScalarType()->isIntegerTy() && "IntToPtr source must be integer or integer vector"); assert(DstTy->getScalarType()->isPointerTy() && "IntToPtr destination must be a pointer or pointer vector"); assert(isa(C->getType()) == isa(DstTy)); if (isa(C->getType())) assert(C->getType()->getVectorNumElements()==DstTy->getVectorNumElements()&& "Invalid cast between a different number of vector elements"); return getFoldedCast(Instruction::IntToPtr, C, DstTy, OnlyIfReduced); } Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy, bool OnlyIfReduced) { assert(CastInst::castIsValid(Instruction::BitCast, C, DstTy) && "Invalid constantexpr bitcast!"); // It is common to ask for a bitcast of a value to its own type, handle this // speedily. if (C->getType() == DstTy) return C; return getFoldedCast(Instruction::BitCast, C, DstTy, OnlyIfReduced); } Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy, bool OnlyIfReduced) { assert(CastInst::castIsValid(Instruction::AddrSpaceCast, C, DstTy) && "Invalid constantexpr addrspacecast!"); // Canonicalize addrspacecasts between different pointer types by first // bitcasting the pointer type and then converting the address space. PointerType *SrcScalarTy = cast(C->getType()->getScalarType()); PointerType *DstScalarTy = cast(DstTy->getScalarType()); Type *DstElemTy = DstScalarTy->getElementType(); if (SrcScalarTy->getElementType() != DstElemTy) { Type *MidTy = PointerType::get(DstElemTy, SrcScalarTy->getAddressSpace()); if (VectorType *VT = dyn_cast(DstTy)) { // Handle vectors of pointers. MidTy = VectorType::get(MidTy, VT->getNumElements()); } C = getBitCast(C, MidTy); } return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy, OnlyIfReduced); } Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2, unsigned Flags, Type *OnlyIfReducedTy) { // Check the operands for consistency first. assert(Opcode >= Instruction::BinaryOpsBegin && Opcode < Instruction::BinaryOpsEnd && "Invalid opcode in binary constant expression"); assert(C1->getType() == C2->getType() && "Operand types in binary constant expression should match"); #ifndef NDEBUG switch (Opcode) { case Instruction::Add: case Instruction::Sub: case Instruction::Mul: assert(C1->getType() == C2->getType() && "Op types should be identical!"); assert(C1->getType()->isIntOrIntVectorTy() && "Tried to create an integer operation on a non-integer type!"); break; case Instruction::FAdd: case Instruction::FSub: case Instruction::FMul: assert(C1->getType() == C2->getType() && "Op types should be identical!"); assert(C1->getType()->isFPOrFPVectorTy() && "Tried to create a floating-point operation on a " "non-floating-point type!"); break; case Instruction::UDiv: case Instruction::SDiv: assert(C1->getType() == C2->getType() && "Op types should be identical!"); assert(C1->getType()->isIntOrIntVectorTy() && "Tried to create an arithmetic operation on a non-arithmetic type!"); break; case Instruction::FDiv: assert(C1->getType() == C2->getType() && "Op types should be identical!"); assert(C1->getType()->isFPOrFPVectorTy() && "Tried to create an arithmetic operation on a non-arithmetic type!"); break; case Instruction::URem: case Instruction::SRem: assert(C1->getType() == C2->getType() && "Op types should be identical!"); assert(C1->getType()->isIntOrIntVectorTy() && "Tried to create an arithmetic operation on a non-arithmetic type!"); break; case Instruction::FRem: assert(C1->getType() == C2->getType() && "Op types should be identical!"); assert(C1->getType()->isFPOrFPVectorTy() && "Tried to create an arithmetic operation on a non-arithmetic type!"); break; case Instruction::And: case Instruction::Or: case Instruction::Xor: assert(C1->getType() == C2->getType() && "Op types should be identical!"); assert(C1->getType()->isIntOrIntVectorTy() && "Tried to create a logical operation on a non-integral type!"); break; case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: assert(C1->getType() == C2->getType() && "Op types should be identical!"); assert(C1->getType()->isIntOrIntVectorTy() && "Tried to create a shift operation on a non-integer type!"); break; default: break; } #endif if (Constant *FC = ConstantFoldBinaryInstruction(Opcode, C1, C2)) return FC; // Fold a few common cases. if (OnlyIfReducedTy == C1->getType()) return nullptr; Constant *ArgVec[] = { C1, C2 }; ConstantExprKeyType Key(Opcode, ArgVec, 0, Flags); LLVMContextImpl *pImpl = C1->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(C1->getType(), Key); } Constant *ConstantExpr::getSizeOf(Type* Ty) { // sizeof is implemented as: (i64) gep (Ty*)null, 1 // Note that a non-inbounds gep is used, as null isn't within any object. Constant *GEPIdx = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1); Constant *GEP = getGetElementPtr( Constant::getNullValue(PointerType::getUnqual(Ty)), GEPIdx); return getPtrToInt(GEP, Type::getInt64Ty(Ty->getContext())); } Constant *ConstantExpr::getAlignOf(Type* Ty) { // alignof is implemented as: (i64) gep ({i1,Ty}*)null, 0, 1 // Note that a non-inbounds gep is used, as null isn't within any object. Type *AligningTy = StructType::get(Type::getInt1Ty(Ty->getContext()), Ty, nullptr); Constant *NullPtr = Constant::getNullValue(AligningTy->getPointerTo(0)); Constant *Zero = ConstantInt::get(Type::getInt64Ty(Ty->getContext()), 0); Constant *One = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1); Constant *Indices[2] = { Zero, One }; Constant *GEP = getGetElementPtr(NullPtr, Indices); return getPtrToInt(GEP, Type::getInt64Ty(Ty->getContext())); } Constant *ConstantExpr::getOffsetOf(StructType* STy, unsigned FieldNo) { return getOffsetOf(STy, ConstantInt::get(Type::getInt32Ty(STy->getContext()), FieldNo)); } Constant *ConstantExpr::getOffsetOf(Type* Ty, Constant *FieldNo) { // offsetof is implemented as: (i64) gep (Ty*)null, 0, FieldNo // Note that a non-inbounds gep is used, as null isn't within any object. Constant *GEPIdx[] = { ConstantInt::get(Type::getInt64Ty(Ty->getContext()), 0), FieldNo }; Constant *GEP = getGetElementPtr( Constant::getNullValue(PointerType::getUnqual(Ty)), GEPIdx); return getPtrToInt(GEP, Type::getInt64Ty(Ty->getContext())); } Constant *ConstantExpr::getCompare(unsigned short Predicate, Constant *C1, Constant *C2, bool OnlyIfReduced) { assert(C1->getType() == C2->getType() && "Op types should be identical!"); switch (Predicate) { default: llvm_unreachable("Invalid CmpInst predicate"); case CmpInst::FCMP_FALSE: case CmpInst::FCMP_OEQ: case CmpInst::FCMP_OGT: case CmpInst::FCMP_OGE: case CmpInst::FCMP_OLT: case CmpInst::FCMP_OLE: case CmpInst::FCMP_ONE: case CmpInst::FCMP_ORD: case CmpInst::FCMP_UNO: case CmpInst::FCMP_UEQ: case CmpInst::FCMP_UGT: case CmpInst::FCMP_UGE: case CmpInst::FCMP_ULT: case CmpInst::FCMP_ULE: case CmpInst::FCMP_UNE: case CmpInst::FCMP_TRUE: return getFCmp(Predicate, C1, C2, OnlyIfReduced); case CmpInst::ICMP_EQ: case CmpInst::ICMP_NE: case CmpInst::ICMP_UGT: case CmpInst::ICMP_UGE: case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE: case CmpInst::ICMP_SGT: case CmpInst::ICMP_SGE: case CmpInst::ICMP_SLT: case CmpInst::ICMP_SLE: return getICmp(Predicate, C1, C2, OnlyIfReduced); } } Constant *ConstantExpr::getSelect(Constant *C, Constant *V1, Constant *V2, Type *OnlyIfReducedTy) { assert(!SelectInst::areInvalidOperands(C, V1, V2)&&"Invalid select operands"); if (Constant *SC = ConstantFoldSelectInstruction(C, V1, V2)) return SC; // Fold common cases if (OnlyIfReducedTy == V1->getType()) return nullptr; Constant *ArgVec[] = { C, V1, V2 }; ConstantExprKeyType Key(Instruction::Select, ArgVec); LLVMContextImpl *pImpl = C->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(V1->getType(), Key); } Constant *ConstantExpr::getGetElementPtr(Constant *C, ArrayRef Idxs, bool InBounds, Type *OnlyIfReducedTy) { assert(C->getType()->isPtrOrPtrVectorTy() && "Non-pointer type for constant GetElementPtr expression"); if (Constant *FC = ConstantFoldGetElementPtr(C, InBounds, Idxs)) return FC; // Fold a few common cases. // Get the result type of the getelementptr! Type *Ty = GetElementPtrInst::getIndexedType(C->getType(), Idxs); assert(Ty && "GEP indices invalid!"); unsigned AS = C->getType()->getPointerAddressSpace(); Type *ReqTy = Ty->getPointerTo(AS); if (VectorType *VecTy = dyn_cast(C->getType())) ReqTy = VectorType::get(ReqTy, VecTy->getNumElements()); if (OnlyIfReducedTy == ReqTy) return nullptr; // Look up the constant in the table first to ensure uniqueness std::vector ArgVec; ArgVec.reserve(1 + Idxs.size()); ArgVec.push_back(C); for (unsigned i = 0, e = Idxs.size(); i != e; ++i) { assert(Idxs[i]->getType()->isVectorTy() == ReqTy->isVectorTy() && "getelementptr index type missmatch"); assert((!Idxs[i]->getType()->isVectorTy() || ReqTy->getVectorNumElements() == Idxs[i]->getType()->getVectorNumElements()) && "getelementptr index type missmatch"); ArgVec.push_back(cast(Idxs[i])); } const ConstantExprKeyType Key(Instruction::GetElementPtr, ArgVec, 0, InBounds ? GEPOperator::IsInBounds : 0); LLVMContextImpl *pImpl = C->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(ReqTy, Key); } Constant *ConstantExpr::getICmp(unsigned short pred, Constant *LHS, Constant *RHS, bool OnlyIfReduced) { assert(LHS->getType() == RHS->getType()); assert(pred >= ICmpInst::FIRST_ICMP_PREDICATE && pred <= ICmpInst::LAST_ICMP_PREDICATE && "Invalid ICmp Predicate"); if (Constant *FC = ConstantFoldCompareInstruction(pred, LHS, RHS)) return FC; // Fold a few common cases... if (OnlyIfReduced) return nullptr; // Look up the constant in the table first to ensure uniqueness Constant *ArgVec[] = { LHS, RHS }; // Get the key type with both the opcode and predicate const ConstantExprKeyType Key(Instruction::ICmp, ArgVec, pred); Type *ResultTy = Type::getInt1Ty(LHS->getContext()); if (VectorType *VT = dyn_cast(LHS->getType())) ResultTy = VectorType::get(ResultTy, VT->getNumElements()); LLVMContextImpl *pImpl = LHS->getType()->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(ResultTy, Key); } Constant *ConstantExpr::getFCmp(unsigned short pred, Constant *LHS, Constant *RHS, bool OnlyIfReduced) { assert(LHS->getType() == RHS->getType()); assert(pred <= FCmpInst::LAST_FCMP_PREDICATE && "Invalid FCmp Predicate"); if (Constant *FC = ConstantFoldCompareInstruction(pred, LHS, RHS)) return FC; // Fold a few common cases... if (OnlyIfReduced) return nullptr; // Look up the constant in the table first to ensure uniqueness Constant *ArgVec[] = { LHS, RHS }; // Get the key type with both the opcode and predicate const ConstantExprKeyType Key(Instruction::FCmp, ArgVec, pred); Type *ResultTy = Type::getInt1Ty(LHS->getContext()); if (VectorType *VT = dyn_cast(LHS->getType())) ResultTy = VectorType::get(ResultTy, VT->getNumElements()); LLVMContextImpl *pImpl = LHS->getType()->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(ResultTy, Key); } Constant *ConstantExpr::getExtractElement(Constant *Val, Constant *Idx, Type *OnlyIfReducedTy) { assert(Val->getType()->isVectorTy() && "Tried to create extractelement operation on non-vector type!"); assert(Idx->getType()->isIntegerTy() && "Extractelement index must be an integer type!"); if (Constant *FC = ConstantFoldExtractElementInstruction(Val, Idx)) return FC; // Fold a few common cases. Type *ReqTy = Val->getType()->getVectorElementType(); if (OnlyIfReducedTy == ReqTy) return nullptr; // Look up the constant in the table first to ensure uniqueness Constant *ArgVec[] = { Val, Idx }; const ConstantExprKeyType Key(Instruction::ExtractElement, ArgVec); LLVMContextImpl *pImpl = Val->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(ReqTy, Key); } Constant *ConstantExpr::getInsertElement(Constant *Val, Constant *Elt, Constant *Idx, Type *OnlyIfReducedTy) { assert(Val->getType()->isVectorTy() && "Tried to create insertelement operation on non-vector type!"); assert(Elt->getType() == Val->getType()->getVectorElementType() && "Insertelement types must match!"); assert(Idx->getType()->isIntegerTy() && "Insertelement index must be i32 type!"); if (Constant *FC = ConstantFoldInsertElementInstruction(Val, Elt, Idx)) return FC; // Fold a few common cases. if (OnlyIfReducedTy == Val->getType()) return nullptr; // Look up the constant in the table first to ensure uniqueness Constant *ArgVec[] = { Val, Elt, Idx }; const ConstantExprKeyType Key(Instruction::InsertElement, ArgVec); LLVMContextImpl *pImpl = Val->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(Val->getType(), Key); } Constant *ConstantExpr::getShuffleVector(Constant *V1, Constant *V2, Constant *Mask, Type *OnlyIfReducedTy) { assert(ShuffleVectorInst::isValidOperands(V1, V2, Mask) && "Invalid shuffle vector constant expr operands!"); if (Constant *FC = ConstantFoldShuffleVectorInstruction(V1, V2, Mask)) return FC; // Fold a few common cases. unsigned NElts = Mask->getType()->getVectorNumElements(); Type *EltTy = V1->getType()->getVectorElementType(); Type *ShufTy = VectorType::get(EltTy, NElts); if (OnlyIfReducedTy == ShufTy) return nullptr; // Look up the constant in the table first to ensure uniqueness Constant *ArgVec[] = { V1, V2, Mask }; const ConstantExprKeyType Key(Instruction::ShuffleVector, ArgVec); LLVMContextImpl *pImpl = ShufTy->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(ShufTy, Key); } Constant *ConstantExpr::getInsertValue(Constant *Agg, Constant *Val, ArrayRef Idxs, Type *OnlyIfReducedTy) { assert(Agg->getType()->isFirstClassType() && "Non-first-class type for constant insertvalue expression"); assert(ExtractValueInst::getIndexedType(Agg->getType(), Idxs) == Val->getType() && "insertvalue indices invalid!"); Type *ReqTy = Val->getType(); if (Constant *FC = ConstantFoldInsertValueInstruction(Agg, Val, Idxs)) return FC; if (OnlyIfReducedTy == ReqTy) return nullptr; Constant *ArgVec[] = { Agg, Val }; const ConstantExprKeyType Key(Instruction::InsertValue, ArgVec, 0, 0, Idxs); LLVMContextImpl *pImpl = Agg->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(ReqTy, Key); } Constant *ConstantExpr::getExtractValue(Constant *Agg, ArrayRef Idxs, Type *OnlyIfReducedTy) { assert(Agg->getType()->isFirstClassType() && "Tried to create extractelement operation on non-first-class type!"); Type *ReqTy = ExtractValueInst::getIndexedType(Agg->getType(), Idxs); (void)ReqTy; assert(ReqTy && "extractvalue indices invalid!"); assert(Agg->getType()->isFirstClassType() && "Non-first-class type for constant extractvalue expression"); if (Constant *FC = ConstantFoldExtractValueInstruction(Agg, Idxs)) return FC; if (OnlyIfReducedTy == ReqTy) return nullptr; Constant *ArgVec[] = { Agg }; const ConstantExprKeyType Key(Instruction::ExtractValue, ArgVec, 0, 0, Idxs); LLVMContextImpl *pImpl = Agg->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(ReqTy, Key); } Constant *ConstantExpr::getNeg(Constant *C, bool HasNUW, bool HasNSW) { assert(C->getType()->isIntOrIntVectorTy() && "Cannot NEG a nonintegral value!"); return getSub(ConstantFP::getZeroValueForNegation(C->getType()), C, HasNUW, HasNSW); } Constant *ConstantExpr::getFNeg(Constant *C) { assert(C->getType()->isFPOrFPVectorTy() && "Cannot FNEG a non-floating-point value!"); return getFSub(ConstantFP::getZeroValueForNegation(C->getType()), C); } Constant *ConstantExpr::getNot(Constant *C) { assert(C->getType()->isIntOrIntVectorTy() && "Cannot NOT a nonintegral value!"); return get(Instruction::Xor, C, Constant::getAllOnesValue(C->getType())); } Constant *ConstantExpr::getAdd(Constant *C1, Constant *C2, bool HasNUW, bool HasNSW) { unsigned Flags = (HasNUW ? OverflowingBinaryOperator::NoUnsignedWrap : 0) | (HasNSW ? OverflowingBinaryOperator::NoSignedWrap : 0); return get(Instruction::Add, C1, C2, Flags); } Constant *ConstantExpr::getFAdd(Constant *C1, Constant *C2) { return get(Instruction::FAdd, C1, C2); } Constant *ConstantExpr::getSub(Constant *C1, Constant *C2, bool HasNUW, bool HasNSW) { unsigned Flags = (HasNUW ? OverflowingBinaryOperator::NoUnsignedWrap : 0) | (HasNSW ? OverflowingBinaryOperator::NoSignedWrap : 0); return get(Instruction::Sub, C1, C2, Flags); } Constant *ConstantExpr::getFSub(Constant *C1, Constant *C2) { return get(Instruction::FSub, C1, C2); } Constant *ConstantExpr::getMul(Constant *C1, Constant *C2, bool HasNUW, bool HasNSW) { unsigned Flags = (HasNUW ? OverflowingBinaryOperator::NoUnsignedWrap : 0) | (HasNSW ? OverflowingBinaryOperator::NoSignedWrap : 0); return get(Instruction::Mul, C1, C2, Flags); } Constant *ConstantExpr::getFMul(Constant *C1, Constant *C2) { return get(Instruction::FMul, C1, C2); } Constant *ConstantExpr::getUDiv(Constant *C1, Constant *C2, bool isExact) { return get(Instruction::UDiv, C1, C2, isExact ? PossiblyExactOperator::IsExact : 0); } Constant *ConstantExpr::getSDiv(Constant *C1, Constant *C2, bool isExact) { return get(Instruction::SDiv, C1, C2, isExact ? PossiblyExactOperator::IsExact : 0); } Constant *ConstantExpr::getFDiv(Constant *C1, Constant *C2) { return get(Instruction::FDiv, C1, C2); } Constant *ConstantExpr::getURem(Constant *C1, Constant *C2) { return get(Instruction::URem, C1, C2); } Constant *ConstantExpr::getSRem(Constant *C1, Constant *C2) { return get(Instruction::SRem, C1, C2); } Constant *ConstantExpr::getFRem(Constant *C1, Constant *C2) { return get(Instruction::FRem, C1, C2); } Constant *ConstantExpr::getAnd(Constant *C1, Constant *C2) { return get(Instruction::And, C1, C2); } Constant *ConstantExpr::getOr(Constant *C1, Constant *C2) { return get(Instruction::Or, C1, C2); } Constant *ConstantExpr::getXor(Constant *C1, Constant *C2) { return get(Instruction::Xor, C1, C2); } Constant *ConstantExpr::getShl(Constant *C1, Constant *C2, bool HasNUW, bool HasNSW) { unsigned Flags = (HasNUW ? OverflowingBinaryOperator::NoUnsignedWrap : 0) | (HasNSW ? OverflowingBinaryOperator::NoSignedWrap : 0); return get(Instruction::Shl, C1, C2, Flags); } Constant *ConstantExpr::getLShr(Constant *C1, Constant *C2, bool isExact) { return get(Instruction::LShr, C1, C2, isExact ? PossiblyExactOperator::IsExact : 0); } Constant *ConstantExpr::getAShr(Constant *C1, Constant *C2, bool isExact) { return get(Instruction::AShr, C1, C2, isExact ? PossiblyExactOperator::IsExact : 0); } /// getBinOpIdentity - Return the identity for the given binary operation, /// i.e. a constant C such that X op C = X and C op X = X for every X. It /// returns null if the operator doesn't have an identity. Constant *ConstantExpr::getBinOpIdentity(unsigned Opcode, Type *Ty) { switch (Opcode) { default: // Doesn't have an identity. return nullptr; case Instruction::Add: case Instruction::Or: case Instruction::Xor: return Constant::getNullValue(Ty); case Instruction::Mul: return ConstantInt::get(Ty, 1); case Instruction::And: return Constant::getAllOnesValue(Ty); } } /// getBinOpAbsorber - Return the absorbing element for the given binary /// operation, i.e. a constant C such that X op C = C and C op X = C for /// every X. For example, this returns zero for integer multiplication. /// It returns null if the operator doesn't have an absorbing element. Constant *ConstantExpr::getBinOpAbsorber(unsigned Opcode, Type *Ty) { switch (Opcode) { default: // Doesn't have an absorber. return nullptr; case Instruction::Or: return Constant::getAllOnesValue(Ty); case Instruction::And: case Instruction::Mul: return Constant::getNullValue(Ty); } } // destroyConstant - Remove the constant from the constant table... // void ConstantExpr::destroyConstant() { getType()->getContext().pImpl->ExprConstants.remove(this); destroyConstantImpl(); } const char *ConstantExpr::getOpcodeName() const { return Instruction::getOpcodeName(getOpcode()); } GetElementPtrConstantExpr:: GetElementPtrConstantExpr(Constant *C, ArrayRef IdxList, Type *DestTy) : ConstantExpr(DestTy, Instruction::GetElementPtr, OperandTraits::op_end(this) - (IdxList.size()+1), IdxList.size()+1) { OperandList[0] = C; for (unsigned i = 0, E = IdxList.size(); i != E; ++i) OperandList[i+1] = IdxList[i]; } //===----------------------------------------------------------------------===// // ConstantData* implementations void ConstantDataArray::anchor() {} void ConstantDataVector::anchor() {} /// getElementType - Return the element type of the array/vector. Type *ConstantDataSequential::getElementType() const { return getType()->getElementType(); } StringRef ConstantDataSequential::getRawDataValues() const { return StringRef(DataElements, getNumElements()*getElementByteSize()); } /// isElementTypeCompatible - Return true if a ConstantDataSequential can be /// formed with a vector or array of the specified element type. /// ConstantDataArray only works with normal float and int types that are /// stored densely in memory, not with things like i42 or x86_f80. bool ConstantDataSequential::isElementTypeCompatible(const Type *Ty) { if (Ty->isFloatTy() || Ty->isDoubleTy()) return true; if (const IntegerType *IT = dyn_cast(Ty)) { switch (IT->getBitWidth()) { case 8: case 16: case 32: case 64: return true; default: break; } } return false; } /// getNumElements - Return the number of elements in the array or vector. unsigned ConstantDataSequential::getNumElements() const { if (ArrayType *AT = dyn_cast(getType())) return AT->getNumElements(); return getType()->getVectorNumElements(); } /// getElementByteSize - Return the size in bytes of the elements in the data. uint64_t ConstantDataSequential::getElementByteSize() const { return getElementType()->getPrimitiveSizeInBits()/8; } /// getElementPointer - Return the start of the specified element. const char *ConstantDataSequential::getElementPointer(unsigned Elt) const { assert(Elt < getNumElements() && "Invalid Elt"); return DataElements+Elt*getElementByteSize(); } /// isAllZeros - return true if the array is empty or all zeros. static bool isAllZeros(StringRef Arr) { for (StringRef::iterator I = Arr.begin(), E = Arr.end(); I != E; ++I) if (*I != 0) return false; return true; } /// getImpl - This is the underlying implementation of all of the /// ConstantDataSequential::get methods. They all thunk down to here, providing /// the correct element type. We take the bytes in as a StringRef because /// we *want* an underlying "char*" to avoid TBAA type punning violations. Constant *ConstantDataSequential::getImpl(StringRef Elements, Type *Ty) { assert(isElementTypeCompatible(Ty->getSequentialElementType())); // If the elements are all zero or there are no elements, return a CAZ, which // is more dense and canonical. if (isAllZeros(Elements)) return ConstantAggregateZero::get(Ty); // Do a lookup to see if we have already formed one of these. auto &Slot = *Ty->getContext() .pImpl->CDSConstants.insert(std::make_pair(Elements, nullptr)) .first; // The bucket can point to a linked list of different CDS's that have the same // body but different types. For example, 0,0,0,1 could be a 4 element array // of i8, or a 1-element array of i32. They'll both end up in the same /// StringMap bucket, linked up by their Next pointers. Walk the list. ConstantDataSequential **Entry = &Slot.second; for (ConstantDataSequential *Node = *Entry; Node; Entry = &Node->Next, Node = *Entry) if (Node->getType() == Ty) return Node; // Okay, we didn't get a hit. Create a node of the right class, link it in, // and return it. if (isa(Ty)) return *Entry = new ConstantDataArray(Ty, Slot.first().data()); assert(isa(Ty)); return *Entry = new ConstantDataVector(Ty, Slot.first().data()); } void ConstantDataSequential::destroyConstant() { // Remove the constant from the StringMap. StringMap &CDSConstants = getType()->getContext().pImpl->CDSConstants; StringMap::iterator Slot = CDSConstants.find(getRawDataValues()); assert(Slot != CDSConstants.end() && "CDS not found in uniquing table"); ConstantDataSequential **Entry = &Slot->getValue(); // Remove the entry from the hash table. if (!(*Entry)->Next) { // If there is only one value in the bucket (common case) it must be this // entry, and removing the entry should remove the bucket completely. assert((*Entry) == this && "Hash mismatch in ConstantDataSequential"); getContext().pImpl->CDSConstants.erase(Slot); } else { // Otherwise, there are multiple entries linked off the bucket, unlink the // node we care about but keep the bucket around. for (ConstantDataSequential *Node = *Entry; ; Entry = &Node->Next, Node = *Entry) { assert(Node && "Didn't find entry in its uniquing hash table!"); // If we found our entry, unlink it from the list and we're done. if (Node == this) { *Entry = Node->Next; break; } } } // If we were part of a list, make sure that we don't delete the list that is // still owned by the uniquing map. Next = nullptr; // Finally, actually delete it. destroyConstantImpl(); } /// get() constructors - Return a constant with array type with an element /// count and element type matching the ArrayRef passed in. Note that this /// can return a ConstantAggregateZero object. Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef Elts) { Type *Ty = ArrayType::get(Type::getInt8Ty(Context), Elts.size()); const char *Data = reinterpret_cast(Elts.data()); return getImpl(StringRef(const_cast(Data), Elts.size()*1), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef Elts){ Type *Ty = ArrayType::get(Type::getInt16Ty(Context), Elts.size()); const char *Data = reinterpret_cast(Elts.data()); return getImpl(StringRef(const_cast(Data), Elts.size()*2), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef Elts){ Type *Ty = ArrayType::get(Type::getInt32Ty(Context), Elts.size()); const char *Data = reinterpret_cast(Elts.data()); return getImpl(StringRef(const_cast(Data), Elts.size()*4), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef Elts){ Type *Ty = ArrayType::get(Type::getInt64Ty(Context), Elts.size()); const char *Data = reinterpret_cast(Elts.data()); return getImpl(StringRef(const_cast(Data), Elts.size()*8), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef Elts) { Type *Ty = ArrayType::get(Type::getFloatTy(Context), Elts.size()); const char *Data = reinterpret_cast(Elts.data()); return getImpl(StringRef(const_cast(Data), Elts.size()*4), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef Elts) { Type *Ty = ArrayType::get(Type::getDoubleTy(Context), Elts.size()); const char *Data = reinterpret_cast(Elts.data()); return getImpl(StringRef(const_cast(Data), Elts.size()*8), Ty); } /// getString - This method constructs a CDS and initializes it with a text /// string. The default behavior (AddNull==true) causes a null terminator to /// be placed at the end of the array (increasing the length of the string by /// one more than the StringRef would normally indicate. Pass AddNull=false /// to disable this behavior. Constant *ConstantDataArray::getString(LLVMContext &Context, StringRef Str, bool AddNull) { if (!AddNull) { const uint8_t *Data = reinterpret_cast(Str.data()); return get(Context, makeArrayRef(const_cast(Data), Str.size())); } SmallVector ElementVals; ElementVals.append(Str.begin(), Str.end()); ElementVals.push_back(0); return get(Context, ElementVals); } /// get() constructors - Return a constant with vector type with an element /// count and element type matching the ArrayRef passed in. Note that this /// can return a ConstantAggregateZero object. Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef Elts){ Type *Ty = VectorType::get(Type::getInt8Ty(Context), Elts.size()); const char *Data = reinterpret_cast(Elts.data()); return getImpl(StringRef(const_cast(Data), Elts.size()*1), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef Elts){ Type *Ty = VectorType::get(Type::getInt16Ty(Context), Elts.size()); const char *Data = reinterpret_cast(Elts.data()); return getImpl(StringRef(const_cast(Data), Elts.size()*2), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef Elts){ Type *Ty = VectorType::get(Type::getInt32Ty(Context), Elts.size()); const char *Data = reinterpret_cast(Elts.data()); return getImpl(StringRef(const_cast(Data), Elts.size()*4), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef Elts){ Type *Ty = VectorType::get(Type::getInt64Ty(Context), Elts.size()); const char *Data = reinterpret_cast(Elts.data()); return getImpl(StringRef(const_cast(Data), Elts.size()*8), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef Elts) { Type *Ty = VectorType::get(Type::getFloatTy(Context), Elts.size()); const char *Data = reinterpret_cast(Elts.data()); return getImpl(StringRef(const_cast(Data), Elts.size()*4), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef Elts) { Type *Ty = VectorType::get(Type::getDoubleTy(Context), Elts.size()); const char *Data = reinterpret_cast(Elts.data()); return getImpl(StringRef(const_cast(Data), Elts.size()*8), Ty); } Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) { assert(isElementTypeCompatible(V->getType()) && "Element type not compatible with ConstantData"); if (ConstantInt *CI = dyn_cast(V)) { if (CI->getType()->isIntegerTy(8)) { SmallVector Elts(NumElts, CI->getZExtValue()); return get(V->getContext(), Elts); } if (CI->getType()->isIntegerTy(16)) { SmallVector Elts(NumElts, CI->getZExtValue()); return get(V->getContext(), Elts); } if (CI->getType()->isIntegerTy(32)) { SmallVector Elts(NumElts, CI->getZExtValue()); return get(V->getContext(), Elts); } assert(CI->getType()->isIntegerTy(64) && "Unsupported ConstantData type"); SmallVector Elts(NumElts, CI->getZExtValue()); return get(V->getContext(), Elts); } if (ConstantFP *CFP = dyn_cast(V)) { if (CFP->getType()->isFloatTy()) { SmallVector Elts(NumElts, CFP->getValueAPF().convertToFloat()); return get(V->getContext(), Elts); } if (CFP->getType()->isDoubleTy()) { SmallVector Elts(NumElts, CFP->getValueAPF().convertToDouble()); return get(V->getContext(), Elts); } } return ConstantVector::getSplat(NumElts, V); } /// getElementAsInteger - If this is a sequential container of integers (of /// any size), return the specified element in the low bits of a uint64_t. uint64_t ConstantDataSequential::getElementAsInteger(unsigned Elt) const { assert(isa(getElementType()) && "Accessor can only be used when element is an integer"); const char *EltPtr = getElementPointer(Elt); // The data is stored in host byte order, make sure to cast back to the right // type to load with the right endianness. switch (getElementType()->getIntegerBitWidth()) { default: llvm_unreachable("Invalid bitwidth for CDS"); case 8: return *const_cast(reinterpret_cast(EltPtr)); case 16: return *const_cast(reinterpret_cast(EltPtr)); case 32: return *const_cast(reinterpret_cast(EltPtr)); case 64: return *const_cast(reinterpret_cast(EltPtr)); } } /// getElementAsAPFloat - If this is a sequential container of floating point /// type, return the specified element as an APFloat. APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const { const char *EltPtr = getElementPointer(Elt); switch (getElementType()->getTypeID()) { default: llvm_unreachable("Accessor can only be used when element is float/double!"); case Type::FloatTyID: { const float *FloatPrt = reinterpret_cast(EltPtr); return APFloat(*const_cast(FloatPrt)); } case Type::DoubleTyID: { const double *DoublePtr = reinterpret_cast(EltPtr); return APFloat(*const_cast(DoublePtr)); } } } /// getElementAsFloat - If this is an sequential container of floats, return /// the specified element as a float. float ConstantDataSequential::getElementAsFloat(unsigned Elt) const { assert(getElementType()->isFloatTy() && "Accessor can only be used when element is a 'float'"); const float *EltPtr = reinterpret_cast(getElementPointer(Elt)); return *const_cast(EltPtr); } /// getElementAsDouble - If this is an sequential container of doubles, return /// the specified element as a float. double ConstantDataSequential::getElementAsDouble(unsigned Elt) const { assert(getElementType()->isDoubleTy() && "Accessor can only be used when element is a 'float'"); const double *EltPtr = reinterpret_cast(getElementPointer(Elt)); return *const_cast(EltPtr); } /// getElementAsConstant - Return a Constant for a specified index's element. /// Note that this has to compute a new constant to return, so it isn't as /// efficient as getElementAsInteger/Float/Double. Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const { if (getElementType()->isFloatTy() || getElementType()->isDoubleTy()) return ConstantFP::get(getContext(), getElementAsAPFloat(Elt)); return ConstantInt::get(getElementType(), getElementAsInteger(Elt)); } /// isString - This method returns true if this is an array of i8. bool ConstantDataSequential::isString() const { return isa(getType()) && getElementType()->isIntegerTy(8); } /// isCString - This method returns true if the array "isString", ends with a /// nul byte, and does not contains any other nul bytes. bool ConstantDataSequential::isCString() const { if (!isString()) return false; StringRef Str = getAsString(); // The last value must be nul. if (Str.back() != 0) return false; // Other elements must be non-nul. return Str.drop_back().find(0) == StringRef::npos; } /// getSplatValue - If this is a splat constant, meaning that all of the /// elements have the same value, return that value. Otherwise return nullptr. Constant *ConstantDataVector::getSplatValue() const { const char *Base = getRawDataValues().data(); // Compare elements 1+ to the 0'th element. unsigned EltSize = getElementByteSize(); for (unsigned i = 1, e = getNumElements(); i != e; ++i) if (memcmp(Base, Base+i*EltSize, EltSize)) return nullptr; // If they're all the same, return the 0th one as a representative. return getElementAsConstant(0); } //===----------------------------------------------------------------------===// // replaceUsesOfWithOnConstant implementations /// replaceUsesOfWithOnConstant - Update this constant array to change uses of /// 'From' to be uses of 'To'. This must update the uniquing data structures /// etc. /// /// Note that we intentionally replace all uses of From with To here. Consider /// a large array that uses 'From' 1000 times. By handling this case all here, /// ConstantArray::replaceUsesOfWithOnConstant is only invoked once, and that /// single invocation handles all 1000 uses. Handling them one at a time would /// work, but would be really slow because it would have to unique each updated /// array instance. /// void Constant::replaceUsesOfWithOnConstantImpl(Constant *Replacement) { // I do need to replace this with an existing value. assert(Replacement != this && "I didn't contain From!"); // Everyone using this now uses the replacement. replaceAllUsesWith(Replacement); // Delete the old constant! destroyConstant(); } void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) { assert(isa(To) && "Cannot make Constant refer to non-constant!"); Constant *ToC = cast(To); SmallVector Values; Values.reserve(getNumOperands()); // Build replacement array. // Fill values with the modified operands of the constant array. Also, // compute whether this turns into an all-zeros array. unsigned NumUpdated = 0; // Keep track of whether all the values in the array are "ToC". bool AllSame = true; for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) { Constant *Val = cast(O->get()); if (Val == From) { Val = ToC; ++NumUpdated; } Values.push_back(Val); AllSame &= Val == ToC; } if (AllSame && ToC->isNullValue()) { replaceUsesOfWithOnConstantImpl(ConstantAggregateZero::get(getType())); return; } if (AllSame && isa(ToC)) { replaceUsesOfWithOnConstantImpl(UndefValue::get(getType())); return; } // Check for any other type of constant-folding. if (Constant *C = getImpl(getType(), Values)) { replaceUsesOfWithOnConstantImpl(C); return; } // Update to the new value. if (Constant *C = getContext().pImpl->ArrayConstants.replaceOperandsInPlace( Values, this, From, ToC, NumUpdated, U - OperandList)) replaceUsesOfWithOnConstantImpl(C); } void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) { assert(isa(To) && "Cannot make Constant refer to non-constant!"); Constant *ToC = cast(To); unsigned OperandToUpdate = U-OperandList; assert(getOperand(OperandToUpdate) == From && "ReplaceAllUsesWith broken!"); SmallVector Values; Values.reserve(getNumOperands()); // Build replacement struct. // Fill values with the modified operands of the constant struct. Also, // compute whether this turns into an all-zeros struct. bool isAllZeros = false; bool isAllUndef = false; if (ToC->isNullValue()) { isAllZeros = true; for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) { Constant *Val = cast(O->get()); Values.push_back(Val); if (isAllZeros) isAllZeros = Val->isNullValue(); } } else if (isa(ToC)) { isAllUndef = true; for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) { Constant *Val = cast(O->get()); Values.push_back(Val); if (isAllUndef) isAllUndef = isa(Val); } } else { for (Use *O = OperandList, *E = OperandList + getNumOperands(); O != E; ++O) Values.push_back(cast(O->get())); } Values[OperandToUpdate] = ToC; if (isAllZeros) { replaceUsesOfWithOnConstantImpl(ConstantAggregateZero::get(getType())); return; } if (isAllUndef) { replaceUsesOfWithOnConstantImpl(UndefValue::get(getType())); return; } // Update to the new value. if (Constant *C = getContext().pImpl->StructConstants.replaceOperandsInPlace( Values, this, From, ToC)) replaceUsesOfWithOnConstantImpl(C); } void ConstantVector::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) { assert(isa(To) && "Cannot make Constant refer to non-constant!"); Constant *ToC = cast(To); SmallVector Values; Values.reserve(getNumOperands()); // Build replacement array... unsigned NumUpdated = 0; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { Constant *Val = getOperand(i); if (Val == From) { ++NumUpdated; Val = ToC; } Values.push_back(Val); } if (Constant *C = getImpl(Values)) { replaceUsesOfWithOnConstantImpl(C); return; } // Update to the new value. if (Constant *C = getContext().pImpl->VectorConstants.replaceOperandsInPlace( Values, this, From, ToC, NumUpdated, U - OperandList)) replaceUsesOfWithOnConstantImpl(C); } void ConstantExpr::replaceUsesOfWithOnConstant(Value *From, Value *ToV, Use *U) { assert(isa(ToV) && "Cannot make Constant refer to non-constant!"); Constant *To = cast(ToV); SmallVector NewOps; unsigned NumUpdated = 0; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { Constant *Op = getOperand(i); if (Op == From) { ++NumUpdated; Op = To; } NewOps.push_back(Op); } assert(NumUpdated && "I didn't contain From!"); if (Constant *C = getWithOperands(NewOps, getType(), true)) { replaceUsesOfWithOnConstantImpl(C); return; } // Update to the new value. if (Constant *C = getContext().pImpl->ExprConstants.replaceOperandsInPlace( NewOps, this, From, To, NumUpdated, U - OperandList)) replaceUsesOfWithOnConstantImpl(C); } Instruction *ConstantExpr::getAsInstruction() { SmallVector ValueOperands; for (op_iterator I = op_begin(), E = op_end(); I != E; ++I) ValueOperands.push_back(cast(I)); ArrayRef Ops(ValueOperands); switch (getOpcode()) { case Instruction::Trunc: case Instruction::ZExt: case Instruction::SExt: case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::UIToFP: case Instruction::SIToFP: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: case Instruction::AddrSpaceCast: return CastInst::Create((Instruction::CastOps)getOpcode(), Ops[0], getType()); case Instruction::Select: return SelectInst::Create(Ops[0], Ops[1], Ops[2]); case Instruction::InsertElement: return InsertElementInst::Create(Ops[0], Ops[1], Ops[2]); case Instruction::ExtractElement: return ExtractElementInst::Create(Ops[0], Ops[1]); case Instruction::InsertValue: return InsertValueInst::Create(Ops[0], Ops[1], getIndices()); case Instruction::ExtractValue: return ExtractValueInst::Create(Ops[0], getIndices()); case Instruction::ShuffleVector: return new ShuffleVectorInst(Ops[0], Ops[1], Ops[2]); case Instruction::GetElementPtr: if (cast(this)->isInBounds()) return GetElementPtrInst::CreateInBounds(Ops[0], Ops.slice(1)); else return GetElementPtrInst::Create(Ops[0], Ops.slice(1)); case Instruction::ICmp: case Instruction::FCmp: return CmpInst::Create((Instruction::OtherOps)getOpcode(), getPredicate(), Ops[0], Ops[1]); default: assert(getNumOperands() == 2 && "Must be binary operator?"); BinaryOperator *BO = BinaryOperator::Create((Instruction::BinaryOps)getOpcode(), Ops[0], Ops[1]); if (isa(BO)) { BO->setHasNoUnsignedWrap(SubclassOptionalData & OverflowingBinaryOperator::NoUnsignedWrap); BO->setHasNoSignedWrap(SubclassOptionalData & OverflowingBinaryOperator::NoSignedWrap); } if (isa(BO)) BO->setIsExact(SubclassOptionalData & PossiblyExactOperator::IsExact); return BO; } } Index: projects/clang360-import/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- projects/clang360-import/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 279025) @@ -1,26693 +1,26857 @@ //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the interfaces that X86 uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86CallingConv.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/VariadicFunction.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include "X86IntrinsicsInfo.h" #include #include #include using namespace llvm; #define DEBUG_TYPE "x86-isel" STATISTIC(NumTailCalls, "Number of tail calls"); static cl::opt ExperimentalVectorWideningLegalization( "x86-experimental-vector-widening-legalization", cl::init(false), cl::desc("Enable an experimental vector type legalization through widening " "rather than promotion."), cl::Hidden); static cl::opt ExperimentalVectorShuffleLowering( "x86-experimental-vector-shuffle-lowering", cl::init(true), cl::desc("Enable an experimental vector shuffle lowering code path."), cl::Hidden); static cl::opt ExperimentalVectorShuffleLegality( "x86-experimental-vector-shuffle-legality", cl::init(false), cl::desc("Enable experimental shuffle legality based on the experimental " "shuffle lowering. Should only be used with the experimental " "shuffle lowering."), cl::Hidden); static cl::opt ReciprocalEstimateRefinementSteps( "x86-recip-refinement-steps", cl::init(1), cl::desc("Specify the number of Newton-Raphson iterations applied to the " "result of the hardware reciprocal estimate instruction."), cl::NotHidden); // Forward declarations. static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl, unsigned vectorWidth) { assert((vectorWidth == 128 || vectorWidth == 256) && "Unsupported vector width"); EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); unsigned Factor = VT.getSizeInBits()/vectorWidth; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); // Extract from UNDEF is UNDEF. if (Vec.getOpcode() == ISD::UNDEF) return DAG.getUNDEF(ResultVT); // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); // This is the index of the first element of the vectorWidth-bit chunk // we want. unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) * ElemsPerChunk); // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, makeArrayRef(Vec->op_begin() + NormalizedIdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } /// Generate a DAG to grab 128-bits from a vector > 128 bits. This /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 /// instructions or a simple subregister reference. Idx is an index in the /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { assert((Vec.getValueType().is256BitVector() || Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); } /// Generate a DAG to grab 256-bits from a 512-bit vector. static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); } static SDValue InsertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl, unsigned vectorWidth) { assert((vectorWidth == 128 || vectorWidth == 256) && "Unsupported vector width"); // Inserting UNDEF is Result if (Vec.getOpcode() == ISD::UNDEF) return Result; EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); EVT ResultVT = Result.getValueType(); // Insert the relevant vectorWidth bits. unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); // This is the index of the first element of the vectorWidth-bit chunk // we want. unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) * ElemsPerChunk); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } /// Generate a DAG to put 128-bits into a vector > 128 bits. This /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a /// simple superregister reference. Idx is an index in the 128 bits /// we want. It need not be aligned to a 128-bit boundary. That makes /// lowering INSERT_VECTOR_ELT operations easier. static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,SDLoc dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 /// instructions. This is used because creating CONCAT_VECTOR nodes of /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower /// large BUILD_VECTORS. static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, SDLoc dl) { SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); return Insert128BitVector(V, V2, NumElems/2, DAG, dl); } static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, SDLoc dl) { SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } // FIXME: This should stop caching the target machine as soon as // we can remove resetOperationActions et al. X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM) : TargetLowering(TM) { Subtarget = &TM.getSubtarget(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); TD = getDataLayout(); resetOperationActions(); } void X86TargetLowering::resetOperationActions() { const TargetMachine &TM = getTargetMachine(); static bool FirstTimeThrough = true; // If none of the target options have changed, then we don't need to reset the // operation actions. if (!FirstTimeThrough && TO == TM.Options) return; if (!FirstTimeThrough) { // Reinitialize the actions. initActions(); FirstTimeThrough = false; } TO = TM.Options; // Set up the TargetLowering object. static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); // X86-SSE is even stranger. It uses -1 or 0 for vector masks. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // For 64-bit, since we have so many registers, use the ILP scheduler. // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling. if (Subtarget->isAtom()) setSchedulingPreference(Sched::ILP); else if (Subtarget->is64Bit()) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); const X86RegisterInfo *RegInfo = TM.getSubtarget().getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides on Atom when compiling with O2. if (TM.getOptLevel() >= CodeGenOpt::Default) { if (Subtarget->hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit()) addBypassSlowDiv(64, 16); } if (Subtarget->isTargetKnownWindowsMSVC()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); setLibcallName(RTLIB::SREM_I64, "_allrem"); setLibcallName(RTLIB::UREM_I64, "_aullrem"); setLibcallName(RTLIB::MUL_I64, "_allmul"); setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); // The _ftol2 runtime function has an unusual calling conv, which // is modeled by a special pseudo-instruction. setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr); setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr); setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr); setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr); } if (Subtarget->isTargetDarwin()) { // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(false); setUseUnderscoreLongJmp(false); } else if (Subtarget->isTargetWindowsGNU()) { // MS runtime is weird: it exports _setjmp, but longjmp! setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(false); } else { setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(true); } // Set up the register classes. addRegisterClass(MVT::i8, &X86::GR8RegClass); addRegisterClass(MVT::i16, &X86::GR16RegClass); addRegisterClass(MVT::i32, &X86::GR32RegClass); if (Subtarget->is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // We don't accept any truncstore of integer registers. setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); setTruncStoreAction(MVT::i64, MVT::i8 , Expand); setTruncStoreAction(MVT::i32, MVT::i16, Expand); setTruncStoreAction(MVT::i32, MVT::i8 , Expand); setTruncStoreAction(MVT::i16, MVT::i8, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); // SETOEQ and SETUNE require checking two conditions. setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); if (Subtarget->is64Bit()) { setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); } else if (!TM.Options.UseSoftFloat) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); } // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); if (!TM.Options.UseSoftFloat) { // SSE has no i16 to fp conversion, only i32 if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); } // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); if (X86ScalarSSEf32) { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } // Handle FP_TO_UINT by promoting the destination to a larger signed // conversion. setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); if (Subtarget->is64Bit()) { setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); } else if (!TM.Options.UseSoftFloat) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) // Expand FP_TO_UINT into a select. // FIXME: We would like to use a Custom expander here eventually to do // the optimal thing for SSE vs. the default expansion in the legalizer. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); else // With SSE3 we can use fisttpll to convert to a signed i64; without // SSE, we're stuck with a fistpll. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); } if (isTargetFTOL()) { // Use the _ftol2 runtime function, which has a pseudo-instruction // to handle its weird calling convention. setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); if (Subtarget->is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } } // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes // the two-result form to trivial CSE, which is able to combine x/y and x%y // into a single instruction. // // Scalar integer multiply-high is also lowered to use two-result // operations, to match the available instructions. However, plain multiply // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. setOperationAction(ISD::ADDC, VT, Custom); setOperationAction(ISD::ADDE, VT, Custom); setOperationAction(ISD::SUBC, VT, Custom); setOperationAction(ISD::SUBE, VT, Custom); } setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); setOperationAction(ISD::BR_CC , MVT::f32, Expand); setOperationAction(ISD::BR_CC , MVT::f64, Expand); setOperationAction(ISD::BR_CC , MVT::f80, Expand); setOperationAction(ISD::BR_CC , MVT::i8, Expand); setOperationAction(ISD::BR_CC , MVT::i16, Expand); setOperationAction(ISD::BR_CC , MVT::i32, Expand); setOperationAction(ISD::BR_CC , MVT::i64, Expand); setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); setOperationAction(ISD::SELECT_CC , MVT::i64, Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. setOperationAction(ISD::CTTZ , MVT::i8 , Promote); AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32); setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote); AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32); if (Subtarget->hasBMI()) { setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); } else { setOperationAction(ISD::CTTZ , MVT::i16 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); if (Subtarget->is64Bit()) setOperationAction(ISD::CTTZ , MVT::i64 , Custom); } if (Subtarget->hasLZCNT()) { // When promoting the i8 variants, force them to i32 for a shorter // encoding. setOperationAction(ISD::CTLZ , MVT::i8 , Promote); AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote); AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); } else { setOperationAction(ISD::CTLZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i16 , Custom); setOperationAction(ISD::CTLZ , MVT::i32 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::CTLZ , MVT::i64 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); } } // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } // There's never any support for operations beyond MVT::f32. setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); if (Subtarget->hasPOPCNT()) { setOperationAction(ISD::CTPOP , MVT::i8 , Promote); } else { setOperationAction(ISD::CTPOP , MVT::i8 , Expand); setOperationAction(ISD::CTPOP , MVT::i16 , Expand); setOperationAction(ISD::CTPOP , MVT::i32 , Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::CTPOP , MVT::i64 , Expand); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); if (!Subtarget->hasMOVBE()) setOperationAction(ISD::BSWAP , MVT::i16 , Expand); // These should be promoted to a larger select which is supported. setOperationAction(ISD::SELECT , MVT::i1 , Promote); // X86 wants to expand cmov itself. setOperationAction(ISD::SELECT , MVT::i8 , Custom); setOperationAction(ISD::SELECT , MVT::i16 , Custom); setOperationAction(ISD::SELECT , MVT::i32 , Custom); setOperationAction(ISD::SELECT , MVT::f32 , Custom); setOperationAction(ISD::SELECT , MVT::f64 , Custom); setOperationAction(ISD::SELECT , MVT::f80 , Custom); setOperationAction(ISD::SETCC , MVT::i8 , Custom); setOperationAction(ISD::SETCC , MVT::i16 , Custom); setOperationAction(ISD::SETCC , MVT::i32 , Custom); setOperationAction(ISD::SETCC , MVT::f32 , Custom); setOperationAction(ISD::SETCC , MVT::f64 , Custom); setOperationAction(ISD::SETCC , MVT::f80 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::SELECT , MVT::i64 , Custom); setOperationAction(ISD::SETCC , MVT::i64 , Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to // support continuation, user-level threading, and etc.. As a result, no // other SjLj exception interfaces are implemented and please don't build // your own exception handling based on them. // LLVM/Clang supports zero-cost DWARF exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); // Darwin ABI issue. setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); setOperationAction(ISD::JumpTable , MVT::i32 , Custom); setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); if (Subtarget->is64Bit()) setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); setOperationAction(ISD::JumpTable , MVT::i64 , Custom); setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); } // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); } if (Subtarget->hasSSE1()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } if (Subtarget->hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() && !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } if (Subtarget->is64Bit()) { setExceptionPointerRegister(X86::RAX); setExceptionSelectorRegister(X86::RDX); } else { setExceptionPointerRegister(X86::EAX); setExceptionSelectorRegister(X86::EDX); } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) { // TargetInfo::X86_64ABIBuiltinVaList setOperationAction(ISD::VAARG , MVT::Other, Custom); setOperationAction(ISD::VACOPY , MVT::Other, Custom); } else { // TargetInfo::CharPtrBuiltinVaList setOperationAction(ISD::VAARG , MVT::Other, Expand); setOperationAction(ISD::VACOPY , MVT::Other, Expand); } setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom); if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); addRegisterClass(MVT::f64, &X86::FR64RegClass); // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS , MVT::f64, Custom); setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f64, Custom); setOperationAction(ISD::FNEG , MVT::f32, Custom); // Use ANDPD and ORPD to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // Lower this to FGETSIGNx86 plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Expand FP immediates into loads from the stack, except for the special // cases we handle. addLegalFPImmediate(APFloat(+0.0)); // xorpd addLegalFPImmediate(APFloat(+0.0f)); // xorps } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f32, Custom); setOperationAction(ISD::UNDEF, MVT::f64, Expand); // Use ANDPS and ORPS to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Special cases we handle for FP constants. addLegalFPImmediate(APFloat(+0.0f)); // xorps addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } } else if (!TM.Options.UseSoftFloat) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, &X86::RFP64RegClass); addRegisterClass(MVT::f32, &X86::RFP32RegClass); setOperationAction(ISD::UNDEF, MVT::f64, Expand); setOperationAction(ISD::UNDEF, MVT::f32, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS addLegalFPImmediate(APFloat(+0.0f)); // FLD0 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS } // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87. if (!TM.Options.UseSoftFloat) { addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); addLegalFPImmediate(TmpFlt); // FLD0/FCHS bool ignored; APFloat TmpFlt2(+1.0); TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, &ignored); addLegalFPImmediate(TmpFlt2); // FLD1 TmpFlt2.changeSign(); addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f80, Expand); setOperationAction(ISD::FCOS , MVT::f80, Expand); setOperationAction(ISD::FSINCOS, MVT::f80, Expand); } setOperationAction(ISD::FFLOOR, MVT::f80, Expand); setOperationAction(ISD::FCEIL, MVT::f80, Expand); setOperationAction(ISD::FTRUNC, MVT::f80, Expand); setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); } // Always use a library call for pow. setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); setOperationAction(ISD::FLOG10, MVT::f80, Expand); setOperationAction(ISD::FEXP, MVT::f80, Expand); setOperationAction(ISD::FEXP2, MVT::f80, Expand); setOperationAction(ISD::FMINNUM, MVT::f80, Expand); setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::ADD , VT, Expand); setOperationAction(ISD::SUB , VT, Expand); setOperationAction(ISD::FADD, VT, Expand); setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::FSUB, VT, Expand); setOperationAction(ISD::MUL , VT, Expand); setOperationAction(ISD::FMUL, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::LOAD, VT, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FPOWI, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::SHL, VT, Expand); setOperationAction(ISD::SRA, VT, Expand); setOperationAction(ISD::SRL, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); setOperationAction(ISD::TRUNCATE, VT, Expand); setOperationAction(ISD::SIGN_EXTEND, VT, Expand); setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(InnerVT, VT, Expand); setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like // types, we have to deal with them whether we ask for Expansion or not. // Setting Expand causes its own optimisation problems though, so leave // them legal. if (VT.getVectorElementType() == MVT::i1) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); } } // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. } // MMX-sized vectors (other than x86mmx) are expected to be expanded // into smaller operations. setOperationAction(ISD::MULHS, MVT::v8i8, Expand); setOperationAction(ISD::MULHS, MVT::v4i16, Expand); setOperationAction(ISD::MULHS, MVT::v2i32, Expand); setOperationAction(ISD::MULHS, MVT::v1i64, Expand); setOperationAction(ISD::AND, MVT::v8i8, Expand); setOperationAction(ISD::AND, MVT::v4i16, Expand); setOperationAction(ISD::AND, MVT::v2i32, Expand); setOperationAction(ISD::AND, MVT::v1i64, Expand); setOperationAction(ISD::OR, MVT::v8i8, Expand); setOperationAction(ISD::OR, MVT::v4i16, Expand); setOperationAction(ISD::OR, MVT::v2i32, Expand); setOperationAction(ISD::OR, MVT::v1i64, Expand); setOperationAction(ISD::XOR, MVT::v8i8, Expand); setOperationAction(ISD::XOR, MVT::v4i16, Expand); setOperationAction(ISD::XOR, MVT::v2i32, Expand); setOperationAction(ISD::XOR, MVT::v1i64, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); setOperationAction(ISD::SELECT, MVT::v8i8, Expand); setOperationAction(ISD::SELECT, MVT::v4i16, Expand); setOperationAction(ISD::SELECT, MVT::v2i32, Expand); setOperationAction(ISD::SELECT, MVT::v1i64, Expand); setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) { addRegisterClass(MVT::v4f32, &X86::VR128RegClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); setOperationAction(ISD::FSUB, MVT::v4f32, Legal); setOperationAction(ISD::FMUL, MVT::v4f32, Legal); setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v4f32, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { addRegisterClass(MVT::v2f64, &X86::VR128RegClass); // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM // registers cannot be used even for integer operations. addRegisterClass(MVT::v16i8, &X86::VR128RegClass); addRegisterClass(MVT::v8i16, &X86::VR128RegClass); addRegisterClass(MVT::v4i32, &X86::VR128RegClass); addRegisterClass(MVT::v2i64, &X86::VR128RegClass); setOperationAction(ISD::ADD, MVT::v16i8, Legal); setOperationAction(ISD::ADD, MVT::v8i16, Legal); setOperationAction(ISD::ADD, MVT::v4i32, Legal); setOperationAction(ISD::ADD, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v16i8, Legal); setOperationAction(ISD::SUB, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v4i32, Legal); setOperationAction(ISD::SUB, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); setOperationAction(ISD::FADD, MVT::v2f64, Legal); setOperationAction(ISD::FSUB, MVT::v2f64, Legal); setOperationAction(ISD::FMUL, MVT::v2f64, Legal); setOperationAction(ISD::FDIV, MVT::v2f64, Legal); setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::SETCC, MVT::v2i64, Custom); setOperationAction(ISD::SETCC, MVT::v16i8, Custom); setOperationAction(ISD::SETCC, MVT::v8i16, Custom); setOperationAction(ISD::SETCC, MVT::v4i32, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); // Only provide customized ctpop vector bit twiddling for vector types we // know to perform better than using the popcnt instructions on each vector // element. If popcnt isn't supported, always provide the custom version. if (!Subtarget->hasPOPCNT()) { setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); } // Custom lower build_vector, vector_shuffle, and extract_vector_elt. for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { MVT VT = (MVT::SimpleValueType)i; // Do not attempt to custom lower non-power-of-2 vectors if (!isPowerOf2_32(VT.getVectorNumElements())) continue; // Do not attempt to custom lower non-128-bit vectors if (!VT.is128BitVector()) continue; setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } // We support custom legalizing of sext and anyext loads for specific // memory vector types which we can load as a scalar (or sequence of // scalars) and extend in-register to a legal 128-bit vector type. For sext // loads these must work with a single scalar load. for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); } setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); } // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { MVT VT = (MVT::SimpleValueType)i; // Do not attempt to promote non-128-bit vectors if (!VT.is128BitVector()) continue; setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v2i64); setOperationAction(ISD::OR, VT, Promote); AddPromotedToType (ISD::OR, VT, MVT::v2i64); setOperationAction(ISD::XOR, VT, Promote); AddPromotedToType (ISD::XOR, VT, MVT::v2i64); setOperationAction(ISD::LOAD, VT, Promote); AddPromotedToType (ISD::LOAD, VT, MVT::v2i64); setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v2i64); } // Custom lower v2i64 and v2f64 selects. setOperationAction(ISD::LOAD, MVT::v2f64, Legal); setOperationAction(ISD::LOAD, MVT::v2i64, Legal); setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); // As there is no 64-bit GPR available, we need build a special custom // sequence to convert from v2i32 to v2f32. if (!Subtarget->is64Bit()) setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); setOperationAction(ISD::FRINT, MVT::v4f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); setOperationAction(ISD::FRINT, MVT::v2f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); setOperationAction(ISD::VSELECT, MVT::v4i32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::VSELECT, MVT::v8i16, Custom); // There is no BLENDI for byte vectors. We don't need to custom lower // some vselects for now. setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); } // i8 and i16 vectors are custom because the source register and source // source memory operand types are not the same width. f32 vectors are // custom since the immediate controlling the insert encodes additional // information. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); // FIXME: these should be Legal, but that's only for the case where // the index is constant. For now custom expand to deal with that. if (Subtarget->is64Bit()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); } } if (Subtarget->hasSSE2()) { setOperationAction(ISD::SRL, MVT::v8i16, Custom); setOperationAction(ISD::SRL, MVT::v16i8, Custom); setOperationAction(ISD::SHL, MVT::v8i16, Custom); setOperationAction(ISD::SHL, MVT::v16i8, Custom); setOperationAction(ISD::SRA, MVT::v8i16, Custom); setOperationAction(ISD::SRA, MVT::v16i8, Custom); // In the customized shift lowering, the legal cases in AVX2 will be // recognized. setOperationAction(ISD::SRL, MVT::v2i64, Custom); setOperationAction(ISD::SRL, MVT::v4i32, Custom); setOperationAction(ISD::SHL, MVT::v2i64, Custom); setOperationAction(ISD::SHL, MVT::v4i32, Custom); setOperationAction(ISD::SRA, MVT::v4i32, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) { addRegisterClass(MVT::v32i8, &X86::VR256RegClass); addRegisterClass(MVT::v16i16, &X86::VR256RegClass); addRegisterClass(MVT::v8i32, &X86::VR256RegClass); addRegisterClass(MVT::v8f32, &X86::VR256RegClass); addRegisterClass(MVT::v4i64, &X86::VR256RegClass); addRegisterClass(MVT::v4f64, &X86::VR256RegClass); setOperationAction(ISD::LOAD, MVT::v8f32, Legal); setOperationAction(ISD::LOAD, MVT::v4f64, Legal); setOperationAction(ISD::LOAD, MVT::v4i64, Legal); setOperationAction(ISD::FADD, MVT::v8f32, Legal); setOperationAction(ISD::FSUB, MVT::v8f32, Legal); setOperationAction(ISD::FMUL, MVT::v8f32, Legal); setOperationAction(ISD::FDIV, MVT::v8f32, Legal); setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal); setOperationAction(ISD::FCEIL, MVT::v8f32, Legal); setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal); setOperationAction(ISD::FRINT, MVT::v8f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal); setOperationAction(ISD::FNEG, MVT::v8f32, Custom); setOperationAction(ISD::FABS, MVT::v8f32, Custom); setOperationAction(ISD::FADD, MVT::v4f64, Legal); setOperationAction(ISD::FSUB, MVT::v4f64, Legal); setOperationAction(ISD::FMUL, MVT::v4f64, Legal); setOperationAction(ISD::FDIV, MVT::v4f64, Legal); setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); setOperationAction(ISD::FRINT, MVT::v4f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal); setOperationAction(ISD::FNEG, MVT::v4f64, Custom); setOperationAction(ISD::FABS, MVT::v4f64, Custom); // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); setOperationAction(ISD::SRL, MVT::v16i16, Custom); setOperationAction(ISD::SRL, MVT::v32i8, Custom); setOperationAction(ISD::SHL, MVT::v16i16, Custom); setOperationAction(ISD::SHL, MVT::v32i8, Custom); setOperationAction(ISD::SRA, MVT::v16i16, Custom); setOperationAction(ISD::SRA, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v16i16, Custom); setOperationAction(ISD::SETCC, MVT::v8i32, Custom); setOperationAction(ISD::SETCC, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f64, Custom); setOperationAction(ISD::VSELECT, MVT::v4i64, Custom); setOperationAction(ISD::VSELECT, MVT::v8i32, Custom); setOperationAction(ISD::VSELECT, MVT::v8f32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); setOperationAction(ISD::FMA, MVT::v2f64, Legal); setOperationAction(ISD::FMA, MVT::f32, Legal); setOperationAction(ISD::FMA, MVT::f64, Legal); } if (Subtarget->hasInt256()) { setOperationAction(ISD::ADD, MVT::v4i64, Legal); setOperationAction(ISD::ADD, MVT::v8i32, Legal); setOperationAction(ISD::ADD, MVT::v16i16, Legal); setOperationAction(ISD::ADD, MVT::v32i8, Legal); setOperationAction(ISD::SUB, MVT::v4i64, Legal); setOperationAction(ISD::SUB, MVT::v8i32, Legal); setOperationAction(ISD::SUB, MVT::v16i16, Legal); setOperationAction(ISD::SUB, MVT::v32i8, Legal); setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, Legal); setOperationAction(ISD::MUL, MVT::v16i16, Legal); // Don't lower v32i8 because there is no 128-bit byte mul setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i16, Legal); setOperationAction(ISD::MULHS, MVT::v16i16, Legal); setOperationAction(ISD::VSELECT, MVT::v16i16, Custom); setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); // Only provide customized ctpop vector bit twiddling for vector types we // know to perform better than using the popcnt instructions on each // vector element. If popcnt isn't supported, always provide the custom // version. if (!Subtarget->hasPOPCNT()) setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); // Custom CTPOP always performs better on natively supported v8i32 setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); setOperationAction(ISD::ADD, MVT::v16i16, Custom); setOperationAction(ISD::ADD, MVT::v32i8, Custom); setOperationAction(ISD::SUB, MVT::v4i64, Custom); setOperationAction(ISD::SUB, MVT::v8i32, Custom); setOperationAction(ISD::SUB, MVT::v16i16, Custom); setOperationAction(ISD::SUB, MVT::v32i8, Custom); setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, Custom); setOperationAction(ISD::MUL, MVT::v16i16, Custom); // Don't lower v32i8 because there is no 128-bit byte mul } // In the customized shift lowering, the legal cases in AVX2 will be // recognized. setOperationAction(ISD::SRL, MVT::v4i64, Custom); setOperationAction(ISD::SRL, MVT::v8i32, Custom); setOperationAction(ISD::SHL, MVT::v4i64, Custom); setOperationAction(ISD::SHL, MVT::v8i32, Custom); setOperationAction(ISD::SRA, MVT::v8i32, Custom); // Custom lower several nodes for 256-bit types. for (MVT VT : MVT::vector_valuetypes()) { if (VT.getScalarSizeInBits() >= 32) { setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); } // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. if (VT.is128BitVector()) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } // Do not attempt to custom lower other non-256-bit vectors if (!VT.is256BitVector()) continue; setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { MVT VT = (MVT::SimpleValueType)i; // Do not attempt to promote non-256-bit vectors if (!VT.is256BitVector()) continue; setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v4i64); setOperationAction(ISD::OR, VT, Promote); AddPromotedToType (ISD::OR, VT, MVT::v4i64); setOperationAction(ISD::XOR, VT, Promote); AddPromotedToType (ISD::XOR, VT, MVT::v4i64); setOperationAction(ISD::LOAD, VT, Promote); AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); } } if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) { addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); addRegisterClass(MVT::i1, &X86::VK1RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); setOperationAction(ISD::AND, MVT::i1, Legal); setOperationAction(ISD::LOAD, MVT::v16f32, Legal); setOperationAction(ISD::LOAD, MVT::v8f64, Legal); setOperationAction(ISD::LOAD, MVT::v8i64, Legal); setOperationAction(ISD::LOAD, MVT::v16i32, Legal); setOperationAction(ISD::LOAD, MVT::v16i1, Legal); setOperationAction(ISD::FADD, MVT::v16f32, Legal); setOperationAction(ISD::FSUB, MVT::v16f32, Legal); setOperationAction(ISD::FMUL, MVT::v16f32, Legal); setOperationAction(ISD::FDIV, MVT::v16f32, Legal); setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::FNEG, MVT::v16f32, Custom); setOperationAction(ISD::FADD, MVT::v8f64, Legal); setOperationAction(ISD::FSUB, MVT::v8f64, Legal); setOperationAction(ISD::FMUL, MVT::v8f64, Legal); setOperationAction(ISD::FDIV, MVT::v8f64, Legal); setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); setOperationAction(ISD::FNEG, MVT::v8f64, Custom); setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); if (Subtarget->is64Bit()) { setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal); setOperationAction(ISD::SETCC, MVT::v16i1, Custom); setOperationAction(ISD::SETCC, MVT::v8i1, Custom); setOperationAction(ISD::MUL, MVT::v8i64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); setOperationAction(ISD::SELECT, MVT::v8i64, Custom); setOperationAction(ISD::SELECT, MVT::v16f32, Custom); setOperationAction(ISD::ADD, MVT::v8i64, Legal); setOperationAction(ISD::ADD, MVT::v16i32, Legal); setOperationAction(ISD::SUB, MVT::v8i64, Legal); setOperationAction(ISD::SUB, MVT::v16i32, Legal); setOperationAction(ISD::MUL, MVT::v16i32, Legal); setOperationAction(ISD::SRL, MVT::v8i64, Custom); setOperationAction(ISD::SRL, MVT::v16i32, Custom); setOperationAction(ISD::SHL, MVT::v8i64, Custom); setOperationAction(ISD::SHL, MVT::v16i32, Custom); setOperationAction(ISD::SRA, MVT::v8i64, Custom); setOperationAction(ISD::SRA, MVT::v16i32, Custom); setOperationAction(ISD::AND, MVT::v8i64, Legal); setOperationAction(ISD::OR, MVT::v8i64, Legal); setOperationAction(ISD::XOR, MVT::v8i64, Legal); setOperationAction(ISD::AND, MVT::v16i32, Legal); setOperationAction(ISD::OR, MVT::v16i32, Legal); setOperationAction(ISD::XOR, MVT::v16i32, Legal); if (Subtarget->hasCDI()) { setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); } // Custom lower several nodes. for (MVT VT : MVT::vector_valuetypes()) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); // Extract subvector is special because the value type // (result) is 256/128-bit but the source is 512-bit wide. if (VT.is128BitVector() || VT.is256BitVector()) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } if (VT.getVectorElementType() == MVT::i1) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); // Do not attempt to custom lower other non-512-bit vectors if (!VT.is512BitVector()) continue; if ( EltSize >= 32) { setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); } } for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { MVT VT = (MVT::SimpleValueType)i; // Do not attempt to promote non-512-bit vectors. if (!VT.is512BitVector()) continue; setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); } }// has AVX-512 if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) { addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); setOperationAction(ISD::LOAD, MVT::v32i16, Legal); setOperationAction(ISD::LOAD, MVT::v64i8, Legal); setOperationAction(ISD::SETCC, MVT::v32i1, Custom); setOperationAction(ISD::SETCC, MVT::v64i1, Custom); setOperationAction(ISD::ADD, MVT::v32i16, Legal); setOperationAction(ISD::ADD, MVT::v64i8, Legal); setOperationAction(ISD::SUB, MVT::v32i16, Legal); setOperationAction(ISD::SUB, MVT::v64i8, Legal); setOperationAction(ISD::MUL, MVT::v32i16, Legal); for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { const MVT VT = (MVT::SimpleValueType)i; const unsigned EltSize = VT.getVectorElementType().getSizeInBits(); // Do not attempt to promote non-512-bit vectors. if (!VT.is512BitVector()) continue; if (EltSize < 32) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); } } } if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) { addRegisterClass(MVT::v4i1, &X86::VK4RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); setOperationAction(ISD::SETCC, MVT::v4i1, Custom); setOperationAction(ISD::SETCC, MVT::v2i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal); setOperationAction(ISD::AND, MVT::v8i32, Legal); setOperationAction(ISD::OR, MVT::v8i32, Legal); setOperationAction(ISD::XOR, MVT::v8i32, Legal); setOperationAction(ISD::AND, MVT::v4i32, Legal); setOperationAction(ISD::OR, MVT::v4i32, Legal); setOperationAction(ISD::XOR, MVT::v4i32, Legal); } // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion // of this type with custom code. for (MVT VT : MVT::vector_valuetypes()) setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); if (!Subtarget->is64Bit()) setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. // // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { // Add/Sub/Mul with overflow operations are custom lowered. MVT VT = IntVTs[i]; setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); setOperationAction(ISD::USUBO, VT, Custom); setOperationAction(ISD::SMULO, VT, Custom); setOperationAction(ISD::UMULO, VT, Custom); } if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); } // Combine sin / cos into one node or libcall if possible. if (Subtarget->hasSinCos()) { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); if (Subtarget->isTargetDarwin()) { // For MacOSX, we don't want the normal expansion of a libcall to sincos. // We want to issue a libcall to __sincos_stret to avoid memory traffic. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } } if (Subtarget->isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); setOperationAction(ISD::UDIV, MVT::i128, Custom); setOperationAction(ISD::SREM, MVT::i128, Custom); setOperationAction(ISD::UREM, MVT::i128, Custom); setOperationAction(ISD::SDIVREM, MVT::i128, Custom); setOperationAction(ISD::UDIVREM, MVT::i128, Custom); } // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::MLOAD); setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::MSTORE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); computeRegisterProperties(); // On Darwin, -Os means optimize for size without hurting performance, // do not reduce the limit. MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; setPrefLoopAlignment(4); // 2^4 bytes. // Predictable cmov don't hurt on atom because it's in-order. PredictableSelectIsExpensive = !Subtarget->isAtom(); EnableExtLdPromotion = true; setPrefFunctionAlignment(4); // 2^4 bytes. verifyIntrinsicTables(); } // This has so far only been implemented for 64-bit MachO. bool X86TargetLowering::useLoadStackGuardNode() const { return Subtarget->isTargetMachO() && Subtarget->is64Bit(); } TargetLoweringBase::LegalizeTypeAction X86TargetLowering::getPreferredVectorAction(EVT VT) const { if (ExperimentalVectorWideningLegalization && VT.getVectorNumElements() != 1 && VT.getVectorElementType().getSimpleVT() != MVT::i1) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) return Subtarget->hasAVX512() ? MVT::i1: MVT::i8; const unsigned NumElts = VT.getVectorNumElements(); const EVT EltVT = VT.getVectorElementType(); if (VT.is512BitVector()) { if (Subtarget->hasAVX512()) if (EltVT == MVT::i32 || EltVT == MVT::i64 || EltVT == MVT::f32 || EltVT == MVT::f64) switch(NumElts) { case 8: return MVT::v8i1; case 16: return MVT::v16i1; } if (Subtarget->hasBWI()) if (EltVT == MVT::i8 || EltVT == MVT::i16) switch(NumElts) { case 32: return MVT::v32i1; case 64: return MVT::v64i1; } } if (VT.is256BitVector() || VT.is128BitVector()) { if (Subtarget->hasVLX()) if (EltVT == MVT::i32 || EltVT == MVT::i64 || EltVT == MVT::f32 || EltVT == MVT::f64) switch(NumElts) { case 2: return MVT::v2i1; case 4: return MVT::v4i1; case 8: return MVT::v8i1; } if (Subtarget->hasBWI() && Subtarget->hasVLX()) if (EltVT == MVT::i8 || EltVT == MVT::i16) switch(NumElts) { case 8: return MVT::v8i1; case 16: return MVT::v16i1; case 32: return MVT::v32i1; } } return VT.changeVectorElementTypeToInteger(); } /// Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (MaxAlign == 16) return; if (VectorType *VTy = dyn_cast(Ty)) { if (VTy->getBitWidth() == 128) MaxAlign = 16; } else if (ArrayType *ATy = dyn_cast(Ty)) { unsigned EltAlign = 0; getMaxByValAlign(ATy->getElementType(), EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast(Ty)) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { unsigned EltAlign = 0; getMaxByValAlign(STy->getElementType(i), EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == 16) break; } } } /// Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { if (Subtarget->is64Bit()) { // Max of 8 and alignment of type. unsigned TyAlign = TD->getABITypeAlignment(Ty); if (TyAlign > 8) return TyAlign; return 8; } unsigned Align = 4; if (Subtarget->hasSSE1()) getMaxByValAlign(Ty, Align); return Align; } /// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If 'IsMemset' is /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. EVT X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { const Function *F = MF.getFunction(); if ((!IsMemset || ZeroMemset) && !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) { if (Size >= 16 && (Subtarget->isUnalignedMemAccessFast() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { if (Size >= 32) { if (Subtarget->hasInt256()) return MVT::v8i32; if (Subtarget->hasFp256()) return MVT::v8f32; } if (Subtarget->hasSSE2()) return MVT::v4i32; if (Subtarget->hasSSE1()) return MVT::v4f32; } else if (!MemcpyStrSrc && Size >= 8 && !Subtarget->is64Bit() && Subtarget->hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. return MVT::f64; } } if (Subtarget->is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; } bool X86TargetLowering::isSafeMemOpType(MVT VT) const { if (VT == MVT::f32) return X86ScalarSSEf32; else if (VT == MVT::f64) return X86ScalarSSEf64; return true; } bool X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { if (Fast) *Fast = Subtarget->isUnalignedMemAccessFast(); return true; } /// Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. unsigned X86TargetLowering::getJumpTableEncoding() const { // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF // symbol. if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && Subtarget->isPICStyleGOT()) return MachineJumpTableInfo::EK_Custom32; // Otherwise, use the normal jump table encoding heuristics. return TargetLowering::getJumpTableEncoding(); } const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid,MCContext &Ctx) const{ assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ && Subtarget->isPICStyleGOT()); // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF // entries. return MCSymbolRefExpr::Create(MBB->getSymbol(), MCSymbolRefExpr::VK_GOTOFF, Ctx); } /// Returns relocation base for the given PIC jumptable. SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget->is64Bit()) // This doesn't have SDLoc associated with it, but is not really the // same as a Register. return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()); return Table; } /// This returns the relocation base for the given PIC jumptable, /// the same as getPICJumpTableRelocBase, but as an MCExpr. const MCExpr *X86TargetLowering:: getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { // X86-64 uses RIP relative addressing based on the jump table label. if (Subtarget->isPICStyleRIPRel()) return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); // Otherwise, the reference is relative to the PIC base. return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); } // FIXME: Why this routine is here? Move to RegInfo! std::pair X86TargetLowering::findRepresentativeClass(MVT VT) const{ const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: return TargetLowering::findRepresentativeClass(VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; break; case MVT::x86mmx: RRC = &X86::VR64RegClass; break; case MVT::f32: case MVT::f64: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: RRC = &X86::VR128RegClass; break; } return std::make_pair(RRC, Cost); } bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const { if (!Subtarget->isTargetLinux()) return false; if (Subtarget->is64Bit()) { // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: Offset = 0x28; if (getTargetMachine().getCodeModel() == CodeModel::Kernel) AddressSpace = 256; else AddressSpace = 257; } else { // %gs:0x14 on i386 Offset = 0x14; AddressSpace = 256; } return true; } bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); return SrcAS < 256 && DestAS < 256; } //===----------------------------------------------------------------------===// // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// #include "X86GenCallingConv.inc" bool X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; return ScratchRegs; } SDValue X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, SDLoc dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); SDValue Flag; SmallVector RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Operand #1 = Bytes To Pop RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), MVT::i16)); // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue ValToCopy = OutVals[i]; EVT ValVT = ValToCopy.getValueType(); // Promote values to the appropriate types. if (VA.getLocInfo() == CCValAssign::SExt) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::ZExt) ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::AExt) ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::BCvt) ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); assert(VA.getLocInfo() != CCValAssign::FPExt && "Unexpected FP-extend for return value."); // If this is x86-64, and we disabled SSE, we can't return FP values, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } // Likewise we can't return F64 values with SSE1 only. gcc does so, but // llvm-gcc has never done it right and no one has noticed, so this // should be OK for now. if (ValVT == MVT::f64 && (Subtarget->is64Bit() && !Subtarget->hasSSE2())) report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) { // If this is a copy from an xmm register to ST(0), use an FPExtend to // change the value to the FP stack register class. if (isScalarFPTypeInSSEReg(VA.getValVT())) ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); RetOps.push_back(ValToCopy); // Don't emit a copytoreg. continue; } // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 // which is returned in RAX / RDX. if (Subtarget->is64Bit()) { if (ValVT == MVT::x86mmx) { if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. if (!Subtarget->hasSSE2()) ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); } } } Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } // The x86-64 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. // Win32 requires us to put the sret argument to %eax as well. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() && (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); unsigned Reg = FuncInfo->getSRetReturnReg(); assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()."); SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); unsigned RetValReg = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? X86::RAX : X86::EAX; Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); Flag = Chain.getValue(1); // RAX/EAX now acts like a return value. RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy())); } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != X86ISD::RET_FLAG) return false; // If we are returning more than one value, we can definitely // not make a tail call see PR19530 if (UI->getNumOperands() > 4) return false; if (UI->getNumOperands() == 4 && UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } EVT X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const { MVT ReturnMVT; // TODO: Is this also valid on 32-bit? if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) ReturnMVT = MVT::i8; else ReturnMVT = MVT::i32; EVT MinVT = getRegisterType(Context, ReturnMVT); return VT.bitsLT(MinVT) ? MinVT : VT; } /// Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// SDValue X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // Assign locations to each value returned by this call. SmallVector RVLocs; bool Is64Bit = Subtarget->is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; EVT CopyVT = VA.getValVT(); // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } // If we prefer to use the value in xmm registers, copy it out as f80 and // use a truncate to move it from fp stack reg to xmm reg. if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag).getValue(1); SDValue Val = Chain.getValue(0); if (CopyVT != VA.getValVT()) Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. DAG.getIntPtrConstant(1)); InFlag = Chain.getValue(2); InVals.push_back(Val); } return Chain; } //===----------------------------------------------------------------------===// // C & StdCall & Fast Calling Convention implementation //===----------------------------------------------------------------------===// // StdCall calling convention seems to be standard for many Windows' API // routines and around. It differs from C calling convention just a little: // callee should clean up the stack, not caller. Symbols should be also // decorated in some fancy way :) It doesn't support any vector arguments. // For info on fast calling convention see Fast Calling Convention (tail call) // implementation LowerX86_32FastCCCallTo. /// CallIsStructReturn - Determines whether a call uses struct return /// semantics. enum StructReturnType { NotStructReturn, RegStructReturn, StackStructReturn }; static StructReturnType callIsStructReturn(const SmallVectorImpl &Outs) { if (Outs.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Outs[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg()) return RegStructReturn; return StackStructReturn; } /// Determines whether a function uses struct return semantics. static StructReturnType argsAreStructReturn(const SmallVectorImpl &Ins) { if (Ins.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Ins[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg()) return RegStructReturn; return StackStructReturn; } /// Make a copy of an aggregate at address specified by "Src" to address /// "Dst" with size and alignment information specified by the specific /// parameter attribute. The copy will be passed as a byval function parameter. static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), /*isVolatile*/false, /*AlwaysInline=*/true, MachinePointerInfo(), MachinePointerInfo()); } /// Return true if the calling convention is one that /// supports tail call optimization. static bool IsTailCallConvention(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::HiPE); } /// \brief Return true if the calling convention is a C calling convention. static bool IsCCallConvention(CallingConv::ID CC) { return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 || CC == CallingConv::X86_64_SysV); } bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) return false; CallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) return false; return true; } /// Return true if the function is being made into /// a tailcall target by changing its ABI. static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, bool GuaranteedTailCallOpt) { return GuaranteedTailCallOpt && IsTailCallConvention(CC); } SDValue X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo *MFI, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; bool AlwaysUseMutable = FuncIsMadeTailCallSafe( CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; // If value is passed by pointer we have address passed instead of the value // itself. if (VA.getLocInfo() == CCValAssign::Indirect) ValVT = VA.getLocVT(); else ValVT = VA.getValVT(); // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they // could be overwritten by lowering of arguments in case of a tail call. if (Flags.isByVal()) { unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); return DAG.getFrameIndex(FI, getPointerTy()); } else { int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, VA.getLocMemOffset(), isImmutable); SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); return DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo::getFixedStack(FI), false, false, false, 0); } } // FIXME: Get this from tablegen. static ArrayRef get64BitArgumentGPRs(CallingConv::ID CallConv, const X86Subtarget *Subtarget) { assert(Subtarget->is64Bit()); if (Subtarget->isCallingConvWin64(CallConv)) { static const MCPhysReg GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); } static const MCPhysReg GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); } // FIXME: Get this from tablegen. static ArrayRef get64BitArgumentXMMs(MachineFunction &MF, CallingConv::ID CallConv, const X86Subtarget *Subtarget) { assert(Subtarget->is64Bit()); if (Subtarget->isCallingConvWin64(CallConv)) { // The XMM registers which might contain var arg parameters are shadowed // in their paired GPR. So we only need to save the GPR to their home // slots. // TODO: __vectorcall will change this. return None; } const Function *Fn = MF.getFunction(); bool NoImplicitFloatOps = Fn->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) // Kernel mode asks for SSE to be disabled, so there are no XMM argument // registers. return None; static const MCPhysReg XMMArgRegs64Bit[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } SDValue X86TargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const Function* Fn = MF.getFunction(); if (Fn->hasExternalLinkage() && Subtarget->isTargetCygMing() && Fn->getName() == "main") FuncInfo->setForceFramePointer(true); MachineFrameInfo *MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); assert(!(isVarArg && IsTailCallConvention(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeFormalArguments(Ins, CC_X86); unsigned LastVal = ~0U; SDValue ArgValue; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; // TODO: If an arg is passed in two places (e.g. reg and stack), skip later // places. assert(VA.getValNo() != LastVal && "Don't support value assigned to multiple locs yet"); (void)LastVal; LastVal = VA.getValNo(); if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); const TargetRegisterClass *RC; if (RegVT == MVT::i32) RC = &X86::GR32RegClass; else if (Is64Bit && RegVT == MVT::i64) RC = &X86::GR64RegClass; else if (RegVT == MVT::f32) RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = &X86::FR64RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) RC = &X86::VR256RegClass; else if (RegVT.is128BitVector()) RC = &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; else if (RegVT == MVT::i1) RC = &X86::VK1RegClass; else if (RegVT == MVT::v8i1) RC = &X86::VK8RegClass; else if (RegVT == MVT::v16i1) RC = &X86::VK16RegClass; else if (RegVT == MVT::v32i1) RC = &X86::VK32RegClass; else if (RegVT == MVT::v64i1) RC = &X86::VK64RegClass; else llvm_unreachable("Unknown argument type!"); unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); // If this is an 8 or 16-bit value, it is really passed promoted to 32 // bits. Insert an assert[sz]ext to capture this, then truncate to the // right size. if (VA.getLocInfo() == CCValAssign::SExt) ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::ZExt) ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::BCvt) ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); if (VA.isExtInLoc()) { // Handle MMX values passed in XMM regs. if (RegVT.isVector()) ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); else ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); } } else { assert(VA.isMemLoc()); ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); } // If value is passed via pointer - do a load. if (VA.getLocInfo() == CCValAssign::Indirect) ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo(), false, false, false, 0); InVals.push_back(ArgValue); } if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) { for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // The x86-64 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. // Win32 requires us to put the sret argument to %eax as well. // Save the argument into a virtual register so that we can access it // from the return points. if (Ins[i].Flags.isSRet()) { unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { MVT PtrTy = getPointerTy(); Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); break; } } } unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. if (FuncIsMadeTailCallSafe(CallConv, MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. We // can skip this if there are no va_start calls. if (MFI->hasVAStart() && (Is64Bit || (CallConv != CallingConv::X86_FastCall && CallConv != CallingConv::X86_ThisCall))) { FuncInfo->setVarArgsFrameIndex( MFI->CreateFixedObject(1, StackSize, true)); } // Figure out if XMM registers are in use. assert(!(MF.getTarget().Options.UseSoftFloat && Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); // 64-bit calling conventions support varargs and register parameters, so we // have to do extra work to spill them in the prologue. if (Is64Bit && isVarArg && MFI->hasVAStart()) { // Find the first unallocated argument registers. ArrayRef ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size()); unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size()); assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); // Gather all the live in physical registers. SmallVector LiveGPRs; SmallVector LiveXMMRegs; SDValue ALVal; for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); LiveGPRs.push_back( DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); } if (!ArgXMMs.empty()) { unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); LiveXMMRegs.push_back( DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); } } if (IsWin64) { const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); // Get to the caller-allocated home save location. Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; FuncInfo->setRegSaveFrameIndex( MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); // Fixup to set vararg frame on shadow area (4 x i64). if (NumIntRegs < 4) FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); } else { // For X86-64, if there are vararg parameters that are passed via // registers, then we must store them to their spots on the stack so // they may be loaded by deferencing the result of va_next. FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); } // Store the integer parameter registers. SmallVector MemOps; SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy()); unsigned Offset = FuncInfo->getVarArgsGPOffset(); for (SDValue Val : LiveGPRs) { SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, DAG.getIntPtrConstant(Offset)); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo::getFixedStack( FuncInfo->getRegSaveFrameIndex(), Offset), false, false, 0); MemOps.push_back(Store); Offset += 8; } if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { // Now store the XMM (fp + vector) parameter registers. SmallVector SaveXMMOps; SaveXMMOps.push_back(Chain); SaveXMMOps.push_back(ALVal); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getRegSaveFrameIndex())); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getVarArgsFPOffset())); SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), LiveXMMRegs.end()); MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, MVT::Other, SaveXMMOps)); } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } if (isVarArg && MFI->hasMustTailInVarArgFunc()) { // Find the largest legal vector type. MVT VecVT = MVT::Other; // FIXME: Only some x86_32 calling conventions support AVX512. if (Subtarget->hasAVX512() && (Is64Bit || (CallConv == CallingConv::X86_VectorCall || CallConv == CallingConv::Intel_OCL_BI))) VecVT = MVT::v16f32; else if (Subtarget->hasAVX()) VecVT = MVT::v8f32; else if (Subtarget->hasSSE2()) VecVT = MVT::v4f32; // We forward some GPRs and some vector types. SmallVector RegParmTypes; MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; RegParmTypes.push_back(IntVT); if (VecVT != MVT::Other) RegParmTypes.push_back(VecVT); // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); // Conservatively forward AL on x86_64, since it might be used for varargs. if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); } // Copy all forwards from physical to virtual registers. for (ForwardedRegister &F : Forwards) { // FIXME: Can we use a less constrained schedule? SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); } } // Some CCs need callee pop. if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. if (!Is64Bit && !IsTailCallConvention(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && argsAreStructReturn(Ins) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } if (!Is64Bit) { // RegSaveFrameIndex is X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); if (CallConv == CallingConv::X86_FastCall || CallConv == CallingConv::X86_ThisCall) // fastcc functions can't have varargs. FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); } FuncInfo->setArgumentStackSize(StackSize); return Chain; } SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); return DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(LocMemOffset), false, false, 0); } /// Emit a load of return address if tail call /// optimization is performed and it is required. SDValue X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, int FPDiff, SDLoc dl) const { // Adjust the Return address stack slot. EVT VT = getPointerTy(); OutRetAddr = getReturnAddressFrameIndex(DAG); // Load the "old" Return address. OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), false, false, false, 0); return SDValue(OutRetAddr.getNode(), 1); } /// Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, unsigned SlotSize, int FPDiff, SDLoc dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. int NewReturnAddrFI = MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack(NewReturnAddrFI), false, false, 0); return Chain; } SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &dl = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; CallingConv::ID CallConv = CLI.CallConv; bool &isTailCall = CLI.IsTailCall; bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo(); if (MF.getTarget().Options.DisableTailCalls) isTailCall = false; bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); if (IsMustTail) { // Force this to be a tail call. The verifier rules are enough to ensure // that we can lower this successfully without moving the return address // around. isTailCall = true; } else if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, MF.getFunction()->hasStructRetAttr(), CLI.RetTy, Outs, OutVals, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require // ABI changes. if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) IsSibcall = true; if (isTailCall) ++NumTailCalls; } assert(!(isVarArg && IsTailCallConvention(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); if (IsSibcall) // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; else if (MF.getTarget().Options.GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; if (isTailCall && !IsSibcall && !IsMustTail) { // Lower arguments at fp - stackoffset + fpdiff. unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); FPDiff = NumBytesCallerPushed - NumBytes; // Set the delta of movement of the returnaddr stackslot. // But only set if delta is greater than previous delta. if (FPDiff < X86Info->getTCReturnAddrDelta()) X86Info->setTCReturnAddrDelta(FPDiff); } unsigned NumBytesToPush = NumBytes; unsigned NumBytesToPop = NumBytes; // If we have an inalloca argument, all stack space has already been allocated // for us and be right at the top of the stack. We don't support multiple // arguments passed in memory when using inalloca. if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { NumBytesToPush = 0; if (!ArgLocs.back().isMemLoc()) report_fatal_error("cannot use inalloca attribute on a register " "parameter"); if (ArgLocs.back().getLocMemOffset() != 0) report_fatal_error("any parameter with the inalloca attribute must be " "the only memory argument"); } if (!IsSibcall) Chain = DAG.getCALLSEQ_START( Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl); SDValue RetAddrFrIdx; // Load return address for tail calls. if (isTailCall && FPDiff) Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, FPDiff, dl); SmallVector, 8> RegsToPass; SmallVector MemOpChains; SDValue StackPtr; // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. const X86RegisterInfo *RegInfo = static_cast( DAG.getSubtarget().getRegisterInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[i].Flags; if (Flags.isInAlloca()) continue; CCValAssign &VA = ArgLocs[i]; EVT RegVT = VA.getLocVT(); SDValue Arg = OutVals[i]; bool isByVal = Flags.isByVal(); // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); break; case CCValAssign::AExt: if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); } else Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); break; case CCValAssign::Indirect: { // Store the argument. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast(SpillSlot)->getIndex(); Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, MachinePointerInfo::getFixedStack(FI), false, false, 0); Arg = SpillSlot; break; } } if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding // shadow reg if callee is a varargs function. unsigned ShadowReg = 0; switch (VA.getLocReg()) { case X86::XMM0: ShadowReg = X86::RCX; break; case X86::XMM1: ShadowReg = X86::RDX; break; case X86::XMM2: ShadowReg = X86::R8; break; case X86::XMM3: ShadowReg = X86::R9; break; } if (ShadowReg) RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); } } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); if (Subtarget->isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. if (!isTailCall) { RegsToPass.push_back(std::make_pair(unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()))); } else { // If we are tail calling and generating PIC/GOT style code load the // address of the callee into ECX. The value in ecx is used as target of // the tail jump. This is done to circumvent the ebx/callee-saved problem // for tail calls on PIC/GOT architectures. Normally we would just put the // address of GOT into ebx and then call target@PLT. But for tail calls // ebx would be restored (since ebx is callee saved) before jumping to the // target@PLT. // Note: The actual moving to ECX is done further down. GlobalAddressSDNode *G = dyn_cast(Callee); if (G && !G->getGlobal()->hasHiddenVisibility() && !G->getGlobal()->hasProtectedVisibility()) Callee = LowerGlobalAddress(Callee, DAG); else if (isa(Callee)) Callee = LowerExternalSymbol(Callee, DAG); } } if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in // the declaration) %al is used as hidden argument to specify the number // of SSE registers used. The contents of %al do not need to match exactly // the number of registers, but must be an ubound on the number of SSE // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); assert((Subtarget->hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); RegsToPass.push_back(std::make_pair(unsigned(X86::AL), DAG.getConstant(NumXMMRegs, MVT::i8))); } if (isVarArg && IsMustTail) { const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); } } // For tail calls lower the arguments to the 'real' stack slots. Sibcalls // don't need this because the eligibility check rejects calls that require // shuffling arguments passed in memory. if (!IsSibcall && isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments are stored to the stack, because the // outgoing stack slots may alias the incoming argument stack slots, and // the alias isn't otherwise explicit. This is slightly more conservative // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); SmallVector MemOpChains2; SDValue FIN; int FI = 0; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (VA.isRegLoc()) continue; assert(VA.isMemLoc()); SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; // Skip inalloca arguments. They don't require any work. if (Flags.isInAlloca()) continue; // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); FIN = DAG.getFrameIndex(FI, getPointerTy()); if (Flags.isByVal()) { // Copy relative to framepointer. SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy()); Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, ArgChain, Flags, DAG, dl)); } else { // Store relative to framepointer. MemOpChains2.push_back( DAG.getStore(ArgChain, dl, Arg, FIN, MachinePointerInfo::getFixedStack(FI), false, false, 0)); } } if (!MemOpChains2.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, getPointerTy(), RegInfo->getSlotSize(), FPDiff, dl); } // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into registers. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } if (DAG.getTarget().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls // through a register, since the call instruction's 32-bit // pc-relative offset may not be large enough to hold the whole // address. } else if (Callee->getOpcode() == ISD::GlobalAddress) { // If the callee is a GlobalAddress node (quite common, every direct call // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack // it. GlobalAddressSDNode* G = cast(Callee); // We should use extra load for direct calls to dllimported functions in // non-JIT mode. const GlobalValue *GV = G->getGlobal(); if (!GV->hasDLLImportStorageClass()) { unsigned char OpFlags = 0; bool ExtraLoad = false; unsigned WrapperKind = ISD::DELETED_NODE; // On ELF targets, in both X86-64 and X86-32 mode, direct calls to // external symbols most go through the PLT in PIC mode. If the symbol // has hidden or protected visibility, or if it is static or local, then // we don't need to use the PLT - we can directly call it. if (Subtarget->isTargetELF() && DAG.getTarget().getRelocationModel() == Reloc::PIC_ && GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (GV->isDeclaration() || GV->isWeakForLinker()) && (!Subtarget->getTargetTriple().isMacOSX() || Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. OpFlags = X86II::MO_DARWIN_STUB; } else if (Subtarget->isPICStyleRIPRel() && isa(GV) && cast(GV)->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::NonLazyBind)) { // If the function is marked as non-lazy, generate an indirect call // which loads from the GOT directly. This avoids runtime overhead // at the cost of eager binding (and one extra byte of encoding). OpFlags = X86II::MO_GOTPCREL; WrapperKind = X86ISD::WrapperRIP; ExtraLoad = true; } Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), G->getOffset(), OpFlags); // Add a wrapper if needed. if (WrapperKind != ISD::DELETED_NODE) Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); // Add extra indirection if needed. if (ExtraLoad) Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(), false, false, false, 0); } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { unsigned char OpFlags = 0; // On ELF targets, in either X86-64 or X86-32 mode, direct calls to // external symbols should go through the PLT. if (Subtarget->isTargetELF() && DAG.getTarget().getRelocationModel() == Reloc::PIC_) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (!Subtarget->getTargetTriple().isMacOSX() || Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. OpFlags = X86II::MO_DARWIN_STUB; } Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), OpFlags); } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); } // Returns a chain & a flag for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SmallVector Ops; if (!IsSibcall && isTailCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, true), DAG.getIntPtrConstant(0, true), InFlag, dl); InFlag = Chain.getValue(1); } Ops.push_back(Chain); Ops.push_back(Callee); if (isTailCall) Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); if (InFlag.getNode()) Ops.push_back(InFlag); if (isTailCall) { // We used to do: //// If this is the first return lowered for this function, add the regs //// to the liveout set for the function. // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); } Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. unsigned NumBytesForCalleeToPop; if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything else if (!Is64Bit && !IsTailCallConvention(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && SR == StackStructReturn) // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. // This is common for Darwin/X86, Linux & Mingw32 targets. // For MSVC Win32 targets, the caller pops the hidden struct pointer. NumBytesForCalleeToPop = 4; else NumBytesForCalleeToPop = 0; // Callee pops nothing. // Returns a flag for retval copy to use. if (!IsSibcall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, true), DAG.getIntPtrConstant(NumBytesForCalleeToPop, true), InFlag, dl); InFlag = Chain.getValue(1); } // Handle result values, copying them out of physregs into vregs that we // return. return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, InVals); } //===----------------------------------------------------------------------===// // Fast Calling Convention (tail call) implementation //===----------------------------------------------------------------------===// // Like std call, callee cleans arguments, convention except that ECX is // reserved for storing the tail called function address. Only 2 registers are // free for argument passing (inreg). Tail call optimization is performed // provided: // * tailcallopt is enabled // * caller/callee are fastcc // On X86_64 architecture with GOT-style position independent code only local // (within module) calls are supported at the moment. // To keep the stack aligned according to platform abi the function // GetAlignedArgumentStackSize ensures that argument delta is always multiples // of stack alignment. (Dynamic linkers need this - darwin's dyld for example) // If a tail called function callee has more arguments than the caller the // caller needs to make sure that there is room to move the RETADDR to. This is // achieved by reserving an area the size of the argument delta right after the // original RETADDR, but before the saved framepointer or the spilled registers // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) // stack layout: // arg1 // arg2 // RETADDR // [ new RETADDR // move area ] // (possible EBP) // ESI // EDI // local1 .. /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned /// for a 16 byte align requirement. unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); const X86RegisterInfo *RegInfo = static_cast( TM.getSubtargetImpl()->getRegisterInfo()); const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; unsigned SlotSize = RegInfo->getSlotSize(); if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { // Number smaller than 12 so just add the difference. Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); } else { // Mask out lower bits, add stackalignment once plus the 12 bytes. Offset = ((~AlignMask) & Offset) + StackAlignment + (StackAlignment-SlotSize); } return Offset; } /// MatchingStackOffset - Return true if the given stack call argument is /// already available in the same position (relatively) of the caller's /// incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, const X86InstrInfo *TII) { unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast(Arg.getOperand(1))->getReg(); if (!TargetRegisterInfo::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) return false; if (!Flags.isByVal()) { if (!TII->isLoadFromStackSlot(Def, FI)) return false; } else { unsigned Opcode = Def->getOpcode(); if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && Def->getOperand(1).isFI()) { FI = Def->getOperand(1).getIndex(); Bytes = Flags.getByValSize(); } else return false; } } else if (LoadSDNode *Ld = dyn_cast(Arg)) { if (Flags.isByVal()) // ByVal argument is passed in as a pointer but it's now being // dereferenced. e.g. // define @foo(%struct.X* %A) { // tail call @bar(%struct.X* byval %A) // } return false; SDValue Ptr = Ld->getBasePtr(); FrameIndexSDNode *FINode = dyn_cast(Ptr); if (!FINode) return false; FI = FINode->getIndex(); } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { FrameIndexSDNode *FINode = cast(Arg); FI = FINode->getIndex(); Bytes = Flags.getByValSize(); } else return false; assert(FI != INT_MAX); if (!MFI->isFixedObjectIndex(FI)) return false; return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); } /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. bool X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const { if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. const MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, // then the FP_EXTEND of the call result is not a nop. It's not safe to // perform a tailcall optimization here. if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) return false; CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); if (DAG.getTarget().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) return true; return false; } // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. const X86RegisterInfo *RegInfo = static_cast( DAG.getSubtarget().getRegisterInfo()); if (RegInfo->needsStackRealignment(MF)) return false; // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; // An stdcall/thiscall caller is expected to clean up its arguments; the // callee isn't going to do that. // FIXME: this is more restrictive than needed. We could produce a tailcall // when the stack adjustment matches. For example, with a thiscall that takes // only one argument. if (!CCMatch && (CallerCC == CallingConv::X86_StdCall || CallerCC == CallingConv::X86_ThisCall)) return false; // Do not sibcall optimize vararg calls unless all arguments are passed via // registers. if (isVarArg && !Outs.empty()) { // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. if (IsCalleeWin64 || IsCallerWin64) return false; SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) if (!ArgLocs[i].isRegLoc()) return false; } // If the call result is in ST0 / ST1, it needs to be popped off the x87 // stack. Therefore, if it's not used by the call it is not safe to optimize // this into a sibcall. bool Unused = false; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (!Ins[i].Used) { Unused = true; break; } } if (Unused) { SmallVector RVLocs; CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; } } // If the calling conventions do not match, then we'd better make sure the // results are returned in the same way as what the caller expects. if (!CCMatch) { SmallVector RVLocs1; CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector RVLocs2; CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) return false; for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) return false; if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) return false; if (RVLocs1[i].isRegLoc()) { if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) return false; } else { if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) return false; } } } // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsCalleeWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); if (CCInfo.getNextStackOffset()) { MachineFunction &MF = DAG.getMachineFunction(); if (MF.getInfo()->getBytesToPopOnReturn()) return false; // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = static_cast(DAG.getSubtarget().getInstrInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; if (!VA.isRegLoc()) { if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, TII)) return false; } } } // If the tailcall address may be in a register, then make sure it's // possible to register allocate for it. In 32-bit, the call address can // only target EAX, EDX, or ECX since the tail call must be scheduled after // callee-saved registers are restored. These happen to be the same // registers used to pass 'inreg' arguments so watch out for those. if (!Subtarget->is64Bit() && ((!isa(Callee) && !isa(Callee)) || DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { unsigned NumInRegs = 0; // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) continue; unsigned Reg = VA.getLocReg(); switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: if (++NumInRegs == MaxInRegs) return false; break; } } } } return true; } FastISel * X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return X86::createFastISel(funcInfo, libInfo); } //===----------------------------------------------------------------------===// // Other Lowering Hooks //===----------------------------------------------------------------------===// static bool MayFoldLoad(SDValue Op) { return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); } static bool MayFoldIntoStore(SDValue Op) { return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); } static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; case X86ISD::BLENDI: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: case X86ISD::PALIGNR: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: case X86ISD::MOVLPS: case X86ISD::MOVLPD: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case X86ISD::VPERMI: return true; } } static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: return DAG.getNode(Opc, dl, VT, V1); } } static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::VPERMILPI: case X86ISD::VPERMI: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); } } static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::PALIGNR: case X86ISD::VALIGN: case X86ISD::SHUFP: case X86ISD::VPERM2X128: return DAG.getNode(Opc, dl, VT, V1, V2, DAG.getConstant(TargetMask, MVT::i8)); } } static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: case X86ISD::MOVLPS: case X86ISD::MOVLPD: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: return DAG.getNode(Opc, dl, VT, V1, V2); } } SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = static_cast( DAG.getSubtarget().getRegisterInfo()); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); int ReturnAddrIndex = FuncInfo->getRAIndex(); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -(int64_t)SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); } return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); } bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra // restrictions. if (!hasSymbolicDisplacement) return true; // FIXME: Some tweaks might be needed for medium code model. if (M != CodeModel::Small && M != CodeModel::Kernel) return false; // For small code model we assume that latest object is 16MB before end of 31 // bits boundary. We may also accept pretty large negative constants knowing // that all objects are in the positive half of address space. if (M == CodeModel::Small && Offset < 16*1024*1024) return true; // For kernel code model we know that all object resist in the negative half // of 32bits address space. We may not accept negative offsets, since they may // be just off and we may accept pretty large positive ones. if (M == CodeModel::Kernel && Offset >= 0) return true; return false; } /// isCalleePop - Determines whether the callee is required to pop its /// own arguments. Callee pop is necessary to support tail calls. bool X86::isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool TailCallOpt) { switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: return !is64Bit; case CallingConv::Fast: case CallingConv::GHC: case CallingConv::HiPE: if (IsVarArg) return false; return TailCallOpt; } } /// \brief Return true if the condition is an unsigned comparison operation. static bool isX86CCUnsigned(unsigned X86CC) { switch (X86CC) { default: llvm_unreachable("Invalid integer condition!"); case X86::COND_E: return true; case X86::COND_G: return false; case X86::COND_GE: return false; case X86::COND_L: return false; case X86::COND_LE: return false; case X86::COND_NE: return true; case X86::COND_B: return true; case X86::COND_A: return true; case X86::COND_BE: return true; case X86::COND_AE: return true; } llvm_unreachable("covered switch fell through?!"); } /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 /// specific condition code, returning the condition code and the LHS/RHS of the /// comparison to make. static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { if (!isFP) { if (ConstantSDNode *RHSC = dyn_cast(RHS)) { if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { // X > -1 -> X == 0, jump !sign. RHS = DAG.getConstant(0, RHS.getValueType()); return X86::COND_NS; } if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { // X < 0 -> X == 0, jump on sign. return X86::COND_S; } if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, RHS.getValueType()); return X86::COND_LE; } } switch (SetCCOpcode) { default: llvm_unreachable("Invalid integer condition!"); case ISD::SETEQ: return X86::COND_E; case ISD::SETGT: return X86::COND_G; case ISD::SETGE: return X86::COND_GE; case ISD::SETLT: return X86::COND_L; case ISD::SETLE: return X86::COND_LE; case ISD::SETNE: return X86::COND_NE; case ISD::SETULT: return X86::COND_B; case ISD::SETUGT: return X86::COND_A; case ISD::SETULE: return X86::COND_BE; case ISD::SETUGE: return X86::COND_AE; } } // First determine if it is required or is profitable to flip the operands. // If LHS is a foldable load, but RHS is not, flip the condition. if (ISD::isNON_EXTLoad(LHS.getNode()) && !ISD::isNON_EXTLoad(RHS.getNode())) { SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); std::swap(LHS, RHS); } switch (SetCCOpcode) { default: break; case ISD::SETOLT: case ISD::SETOLE: case ISD::SETUGT: case ISD::SETUGE: std::swap(LHS, RHS); break; } // On a floating point condition, the flags are set as follows: // ZF PF CF op // 0 | 0 | 0 | X > Y // 0 | 0 | 1 | X < Y // 1 | 0 | 0 | X == Y // 1 | 1 | 1 | unordered switch (SetCCOpcode) { default: llvm_unreachable("Condcode should be pre-legalized away"); case ISD::SETUEQ: case ISD::SETEQ: return X86::COND_E; case ISD::SETOLT: // flipped case ISD::SETOGT: case ISD::SETGT: return X86::COND_A; case ISD::SETOLE: // flipped case ISD::SETOGE: case ISD::SETGE: return X86::COND_AE; case ISD::SETUGT: // flipped case ISD::SETULT: case ISD::SETLT: return X86::COND_B; case ISD::SETUGE: // flipped case ISD::SETULE: case ISD::SETLE: return X86::COND_BE; case ISD::SETONE: case ISD::SETNE: return X86::COND_NE; case ISD::SETUO: return X86::COND_P; case ISD::SETO: return X86::COND_NP; case ISD::SETOEQ: case ISD::SETUNE: return X86::COND_INVALID; } } /// hasFPCMov - is there a floating point cmov for the specific X86 condition /// code. Current x86 isa includes the following FP cmov instructions: /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { default: return false; case X86::COND_B: case X86::COND_BE: case X86::COND_E: case X86::COND_P: case X86::COND_A: case X86::COND_AE: case X86::COND_NE: case X86::COND_NP: return true; } } /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) return true; } return false; } bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF // relocation target a movq or addq instruction: don't let the load shrink. SDValue BasePtr = cast(Load)->getBasePtr(); if (BasePtr.getOpcode() == X86ISD::WrapperRIP) if (const auto *GA = dyn_cast(BasePtr.getOperand(0))) return GA->getTargetFlags() != X86II::MO_GOTTPOFF; return true; } /// \brief Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); if (BitSize == 0 || BitSize > 64) return false; return true; } bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; return (Index == 0 || Index == ResVT.getVectorNumElements()); } bool X86TargetLowering::isCheapToSpeculateCttz() const { // Speculate cttz only if we can directly use TZCNT. return Subtarget->hasBMI(); } bool X86TargetLowering::isCheapToSpeculateCtlz() const { // Speculate ctlz only if we can directly use LZCNT. return Subtarget->hasLZCNT(); } /// isUndefOrInRange - Return true if Val is undef or if its value falls within /// the specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val < 0) || (Val >= Low && Val < Hi); } /// isUndefOrEqual - Val is either less than zero (undef) or equal to the /// specified value. static bool isUndefOrEqual(int Val, int CmpVal) { return (Val < 0 || Val == CmpVal); } /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size]. or is undef. static bool isSequentialOrUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size, int Low) { for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) if (!isUndefOrEqual(Mask[i], Low)) return false; return true; } /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFD. That is, it doesn't reference the other /// operand - by default will match for first operand. static bool isPSHUFDMask(ArrayRef Mask, MVT VT, bool TestSecondOperand = false) { if (VT != MVT::v4f32 && VT != MVT::v4i32 && VT != MVT::v2f64 && VT != MVT::v2i64) return false; unsigned NumElems = VT.getVectorNumElements(); unsigned Lo = TestSecondOperand ? NumElems : 0; unsigned Hi = Lo + NumElems; for (unsigned i = 0; i < NumElems; ++i) if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi)) return false; return true; } /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFHW. static bool isPSHUFHWMask(ArrayRef Mask, MVT VT, bool HasInt256) { if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) return false; // Lower quadword copied in order or undef. if (!isSequentialOrUndefInRange(Mask, 0, 4, 0)) return false; // Upper quadword shuffled. for (unsigned i = 4; i != 8; ++i) if (!isUndefOrInRange(Mask[i], 4, 8)) return false; if (VT == MVT::v16i16) { // Lower quadword copied in order or undef. if (!isSequentialOrUndefInRange(Mask, 8, 4, 8)) return false; // Upper quadword shuffled. for (unsigned i = 12; i != 16; ++i) if (!isUndefOrInRange(Mask[i], 12, 16)) return false; } return true; } /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFLW. static bool isPSHUFLWMask(ArrayRef Mask, MVT VT, bool HasInt256) { if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) return false; // Upper quadword copied in order. if (!isSequentialOrUndefInRange(Mask, 4, 4, 4)) return false; // Lower quadword shuffled. for (unsigned i = 0; i != 4; ++i) if (!isUndefOrInRange(Mask[i], 0, 4)) return false; if (VT == MVT::v16i16) { // Upper quadword copied in order. if (!isSequentialOrUndefInRange(Mask, 12, 4, 12)) return false; // Lower quadword shuffled. for (unsigned i = 8; i != 12; ++i) if (!isUndefOrInRange(Mask[i], 8, 12)) return false; } return true; } /// \brief Return true if the mask specifies a shuffle of elements that is /// suitable for input to intralane (palignr) or interlane (valign) vector /// right-shift. static bool isAlignrMask(ArrayRef Mask, MVT VT, bool InterLane) { unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; // Do not handle 64-bit element shuffles with palignr. if (NumLaneElts == 2) return false; for (unsigned l = 0; l != NumElts; l+=NumLaneElts) { unsigned i; for (i = 0; i != NumLaneElts; ++i) { if (Mask[i+l] >= 0) break; } // Lane is all undef, go to next lane if (i == NumLaneElts) continue; int Start = Mask[i+l]; // Make sure its in this lane in one of the sources if (!isUndefOrInRange(Start, l, l+NumLaneElts) && !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts)) return false; // If not lane 0, then we must match lane 0 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l)) return false; // Correct second source to be contiguous with first source if (Start >= (int)NumElts) Start -= NumElts - NumLaneElts; // Make sure we're shifting in the right direction. if (Start <= (int)(i+l)) return false; Start -= i; // Check the rest of the elements to see if they are consecutive. for (++i; i != NumLaneElts; ++i) { int Idx = Mask[i+l]; // Make sure its in this lane if (!isUndefOrInRange(Idx, l, l+NumLaneElts) && !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts)) return false; // If not lane 0, then we must match lane 0 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l)) return false; if (Idx >= (int)NumElts) Idx -= NumElts - NumLaneElts; if (!isUndefOrEqual(Idx, Start+i)) return false; } } return true; } /// \brief Return true if the node specifies a shuffle of elements that is /// suitable for input to PALIGNR. static bool isPALIGNRMask(ArrayRef Mask, MVT VT, const X86Subtarget *Subtarget) { if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || (VT.is256BitVector() && !Subtarget->hasInt256()) || VT.is512BitVector()) // FIXME: Add AVX512BW. return false; return isAlignrMask(Mask, VT, false); } /// \brief Return true if the node specifies a shuffle of elements that is /// suitable for input to VALIGN. static bool isVALIGNMask(ArrayRef Mask, MVT VT, const X86Subtarget *Subtarget) { // FIXME: Add AVX512VL. if (!VT.is512BitVector() || !Subtarget->hasAVX512()) return false; return isAlignrMask(Mask, VT, true); } /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming /// the two vector operands have swapped position. static void CommuteVectorShuffleMask(SmallVectorImpl &Mask, unsigned NumElems) { for (unsigned i = 0; i != NumElems; ++i) { int idx = Mask[i]; if (idx < 0) continue; else if (idx < (int)NumElems) Mask[i] = idx + NumElems; else Mask[i] = idx - NumElems; } } /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 128/256-bit /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be /// reverse of what x86 shuffles want. static bool isSHUFPMask(ArrayRef Mask, MVT VT, bool Commuted = false) { unsigned NumElems = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElems = NumElems/NumLanes; if (NumLaneElems != 2 && NumLaneElems != 4) return false; unsigned EltSize = VT.getVectorElementType().getSizeInBits(); bool symetricMaskRequired = (VT.getSizeInBits() >= 256) && (EltSize == 32); // VSHUFPSY divides the resulting vector into 4 chunks. // The sources are also splitted into 4 chunks, and each destination // chunk must come from a different source chunk. // // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0 // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9 // // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, // Y3..Y0, Y3..Y0, X3..X0, X3..X0 // // VSHUFPDY divides the resulting vector into 4 chunks. // The sources are also splitted into 4 chunks, and each destination // chunk must come from a different source chunk. // // SRC1 => X3 X2 X1 X0 // SRC2 => Y3 Y2 Y1 Y0 // // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 // SmallVector MaskVal(NumLaneElems, -1); unsigned HalfLaneElems = NumLaneElems/2; for (unsigned l = 0; l != NumElems; l += NumLaneElems) { for (unsigned i = 0; i != NumLaneElems; ++i) { int Idx = Mask[i+l]; unsigned RngStart = l + ((Commuted == (i Mask, MVT VT) { if (!VT.is128BitVector()) return false; unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 4) return false; // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 return isUndefOrEqual(Mask[0], 6) && isUndefOrEqual(Mask[1], 7) && isUndefOrEqual(Mask[2], 2) && isUndefOrEqual(Mask[3], 3); } /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, /// <2, 3, 2, 3> static bool isMOVHLPS_v_undef_Mask(ArrayRef Mask, MVT VT) { if (!VT.is128BitVector()) return false; unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 4) return false; return isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3) && isUndefOrEqual(Mask[2], 2) && isUndefOrEqual(Mask[3], 3); } /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. static bool isMOVLPMask(ArrayRef Mask, MVT VT) { if (!VT.is128BitVector()) return false; unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 2 && NumElems != 4) return false; for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i + NumElems)) return false; for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; return true; } /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLHPS. static bool isMOVLHPSMask(ArrayRef Mask, MVT VT) { if (!VT.is128BitVector()) return false; unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 2 && NumElems != 4) return false; for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i + e], i + NumElems)) return false; return true; } /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to INSERTPS. /// i. e: If all but one element come from the same vector. static bool isINSERTPSMask(ArrayRef Mask, MVT VT) { // TODO: Deal with AVX's VINSERTPS if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32)) return false; unsigned CorrectPosV1 = 0; unsigned CorrectPosV2 = 0; for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) { if (Mask[i] == -1) { ++CorrectPosV1; ++CorrectPosV2; continue; } if (Mask[i] == i) ++CorrectPosV1; else if (Mask[i] == i + 4) ++CorrectPosV2; } if (CorrectPosV1 == 3 || CorrectPosV2 == 3) // We have 3 elements (undefs count as elements from any vector) from one // vector, and one from another. return true; return false; } // // Some special combinations that can be optimized. // static SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { MVT VT = SVOp->getSimpleValueType(0); SDLoc dl(SVOp); if (VT != MVT::v8i32 && VT != MVT::v8f32) return SDValue(); ArrayRef Mask = SVOp->getMask(); // These are the special masks that may be optimized. static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14}; static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15}; bool MatchEvenMask = true; bool MatchOddMask = true; for (int i=0; i<8; ++i) { if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i])) MatchEvenMask = false; if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) MatchOddMask = false; } if (!MatchEvenMask && !MatchOddMask) return SDValue(); SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); SDValue Op0 = SVOp->getOperand(0); SDValue Op1 = SVOp->getOperand(1); if (MatchEvenMask) { // Shift the second operand right to 32 bits. static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 }; Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask); } else { // Shift the first operand left to 32 bits. static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 }; Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask); } static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15}; return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask); } /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. static bool isUNPCKLMask(ArrayRef Mask, MVT VT, bool HasInt256, bool V2IsSplat = false) { assert(VT.getSizeInBits() >= 128 && "Unsupported vector type for unpckl"); unsigned NumElts = VT.getVectorNumElements(); if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) && "Unsupported vector type for unpckh"); // AVX defines UNPCK* to operate independently on 128-bit lanes. unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { int BitI = Mask[l+i]; int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; if (V2IsSplat) { if (!isUndefOrEqual(BitI1, NumElts)) return false; } else { if (!isUndefOrEqual(BitI1, j + NumElts)) return false; } } } return true; } /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKH. static bool isUNPCKHMask(ArrayRef Mask, MVT VT, bool HasInt256, bool V2IsSplat = false) { assert(VT.getSizeInBits() >= 128 && "Unsupported vector type for unpckh"); unsigned NumElts = VT.getVectorNumElements(); if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) && "Unsupported vector type for unpckh"); // AVX defines UNPCK* to operate independently on 128-bit lanes. unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { int BitI = Mask[l+i]; int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; if (V2IsSplat) { if (isUndefOrEqual(BitI1, NumElts)) return false; } else { if (!isUndefOrEqual(BitI1, j+NumElts)) return false; } } } return true; } /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, /// <0, 0, 1, 1> static bool isUNPCKL_v_undef_Mask(ArrayRef Mask, MVT VT, bool HasInt256) { unsigned NumElts = VT.getVectorNumElements(); bool Is256BitVec = VT.is256BitVector(); if (VT.is512BitVector()) return false; assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); if (Is256BitVec && NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern // FIXME: Need a better way to get rid of this, there's no latency difference // between UNPCKLPD and MOVDDUP, the later should always be checked first and // the former later. We should also remove the "_undef" special mask. if (NumElts == 4 && Is256BitVec) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate // independently on 128-bit lanes. unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { int BitI = Mask[l+i]; int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; if (!isUndefOrEqual(BitI1, j)) return false; } } return true; } /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, /// <2, 2, 3, 3> static bool isUNPCKH_v_undef_Mask(ArrayRef Mask, MVT VT, bool HasInt256) { unsigned NumElts = VT.getVectorNumElements(); if (VT.is512BitVector()) return false; assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate // independently on 128-bit lanes. unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { int BitI = Mask[l+i]; int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; if (!isUndefOrEqual(BitI1, j)) return false; } } return true; } // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or // (src1[0], src0[1]), manipulation with 256-bit sub-vectors static bool isINSERT64x4Mask(ArrayRef Mask, MVT VT, unsigned int *Imm) { if (!VT.is512BitVector()) return false; unsigned NumElts = VT.getVectorNumElements(); unsigned HalfSize = NumElts/2; if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) { if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) { *Imm = 1; return true; } } if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) { if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) { *Imm = 0; return true; } } return false; } /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSS, /// MOVSD, and MOVD, i.e. setting the lowest element. static bool isMOVLMask(ArrayRef Mask, EVT VT) { if (VT.getVectorElementType().getSizeInBits() < 32) return false; if (!VT.is128BitVector()) return false; unsigned NumElts = VT.getVectorNumElements(); if (!isUndefOrEqual(Mask[0], NumElts)) return false; for (unsigned i = 1; i != NumElts; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; return true; } /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered /// as permutations between 128-bit chunks or halves. As an example: this /// shuffle bellow: /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> /// The first half comes from the second half of V1 and the second half from the /// the second half of V2. static bool isVPERM2X128Mask(ArrayRef Mask, MVT VT, bool HasFp256) { if (!HasFp256 || !VT.is256BitVector()) return false; // The shuffle result is divided into half A and half B. In total the two // sources have 4 halves, namely: C, D, E, F. The final values of A and // B must come from C, D, E or F. unsigned HalfSize = VT.getVectorNumElements()/2; bool MatchA = false, MatchB = false; // Check if A comes from one of C, D, E, F. for (unsigned Half = 0; Half != 4; ++Half) { if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { MatchA = true; break; } } // Check if B comes from one of C, D, E, F. for (unsigned Half = 0; Half != 4; ++Half) { if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { MatchB = true; break; } } return MatchA && MatchB; } /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { MVT VT = SVOp->getSimpleValueType(0); unsigned HalfSize = VT.getVectorNumElements()/2; unsigned FstHalf = 0, SndHalf = 0; for (unsigned i = 0; i < HalfSize; ++i) { if (SVOp->getMaskElt(i) > 0) { FstHalf = SVOp->getMaskElt(i)/HalfSize; break; } } for (unsigned i = HalfSize; i < HalfSize*2; ++i) { if (SVOp->getMaskElt(i) > 0) { SndHalf = SVOp->getMaskElt(i)/HalfSize; break; } } return (FstHalf | (SndHalf << 4)); } // Symetric in-lane mask. Each lane has 4 elements (for imm8) static bool isPermImmMask(ArrayRef Mask, MVT VT, unsigned& Imm8) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); if (EltSize < 32) return false; unsigned NumElts = VT.getVectorNumElements(); Imm8 = 0; if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) { for (unsigned i = 0; i != NumElts; ++i) { if (Mask[i] < 0) continue; Imm8 |= Mask[i] << (i*2); } return true; } unsigned LaneSize = 4; SmallVector MaskVal(LaneSize, -1); for (unsigned l = 0; l != NumElts; l += LaneSize) { for (unsigned i = 0; i != LaneSize; ++i) { if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) return false; if (Mask[i+l] < 0) continue; if (MaskVal[i] < 0) { MaskVal[i] = Mask[i+l] - l; Imm8 |= MaskVal[i] << (i*2); continue; } if (Mask[i+l] != (signed)(MaskVal[i]+l)) return false; } } return true; } /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to VPERMILPD*. /// Note that VPERMIL mask matching is different depending whether theunderlying /// type is 32 or 64. In the VPERMILPS the high half of the mask should point /// to the same elements of the low, but to the higher half of the source. /// In VPERMILPD the two lanes could be shuffled independently of each other /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. static bool isVPERMILPMask(ArrayRef Mask, MVT VT) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); if (VT.getSizeInBits() < 256 || EltSize < 32) return false; bool symetricMaskRequired = (EltSize == 32); unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits()/128; unsigned LaneSize = NumElts/NumLanes; // 2 or 4 elements in one lane SmallVector ExpectedMaskVal(LaneSize, -1); for (unsigned l = 0; l != NumElts; l += LaneSize) { for (unsigned i = 0; i != LaneSize; ++i) { if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) return false; if (symetricMaskRequired) { if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) { ExpectedMaskVal[i] = Mask[i+l] - l; continue; } if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l)) return false; } } } return true; } /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse /// of what x86 movss want. X86 movs requires the lowest element to be lowest /// element of vector 2 and the other elements to come from vector 1 in order. static bool isCommutedMOVLMask(ArrayRef Mask, MVT VT, bool V2IsSplat = false, bool V2IsUndef = false) { if (!VT.is128BitVector()) return false; unsigned NumOps = VT.getVectorNumElements(); if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) return false; if (!isUndefOrEqual(Mask[0], 0)) return false; for (unsigned i = 1; i != NumOps; ++i) if (!(isUndefOrEqual(Mask[i], i+NumOps) || (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) return false; return true; } /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSHDUP. /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> static bool isMOVSHDUPMask(ArrayRef Mask, MVT VT, const X86Subtarget *Subtarget) { if (!Subtarget->hasSSE3()) return false; unsigned NumElems = VT.getVectorNumElements(); if ((VT.is128BitVector() && NumElems != 4) || (VT.is256BitVector() && NumElems != 8) || (VT.is512BitVector() && NumElems != 16)) return false; // "i+1" is the value the indexed mask element must have for (unsigned i = 0; i != NumElems; i += 2) if (!isUndefOrEqual(Mask[i], i+1) || !isUndefOrEqual(Mask[i+1], i+1)) return false; return true; } /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSLDUP. /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> static bool isMOVSLDUPMask(ArrayRef Mask, MVT VT, const X86Subtarget *Subtarget) { if (!Subtarget->hasSSE3()) return false; unsigned NumElems = VT.getVectorNumElements(); if ((VT.is128BitVector() && NumElems != 4) || (VT.is256BitVector() && NumElems != 8) || (VT.is512BitVector() && NumElems != 16)) return false; // "i" is the value the indexed mask element must have for (unsigned i = 0; i != NumElems; i += 2) if (!isUndefOrEqual(Mask[i], i) || !isUndefOrEqual(Mask[i+1], i)) return false; return true; } /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 256-bit /// version of MOVDDUP. static bool isMOVDDUPYMask(ArrayRef Mask, MVT VT, bool HasFp256) { if (!HasFp256 || !VT.is256BitVector()) return false; unsigned NumElts = VT.getVectorNumElements(); if (NumElts != 4) return false; for (unsigned i = 0; i != NumElts/2; ++i) if (!isUndefOrEqual(Mask[i], 0)) return false; for (unsigned i = NumElts/2; i != NumElts; ++i) if (!isUndefOrEqual(Mask[i], NumElts/2)) return false; return true; } /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 128-bit /// version of MOVDDUP. static bool isMOVDDUPMask(ArrayRef Mask, MVT VT) { if (!VT.is128BitVector()) return false; unsigned e = VT.getVectorNumElements() / 2; for (unsigned i = 0; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; for (unsigned i = 0; i != e; ++i) if (!isUndefOrEqual(Mask[e+i], i)) return false; return true; } /// isVEXTRACTIndex - Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is /// suitable for instruction that extract 128 or 256 bit vectors static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa(N->getOperand(1).getNode())) return false; // The index should be aligned on a vecWidth-bit boundary. uint64_t Index = cast(N->getOperand(1).getNode())->getZExtValue(); MVT VT = N->getSimpleValueType(0); unsigned ElSize = VT.getVectorElementType().getSizeInBits(); bool Result = (Index * ElSize) % vecWidth == 0; return Result; } /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR /// operand specifies a subvector insert that is suitable for input to /// insertion of 128 or 256-bit subvectors static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa(N->getOperand(2).getNode())) return false; // The index should be aligned on a vecWidth-bit boundary. uint64_t Index = cast(N->getOperand(2).getNode())->getZExtValue(); MVT VT = N->getSimpleValueType(0); unsigned ElSize = VT.getVectorElementType().getSizeInBits(); bool Result = (Index * ElSize) % vecWidth == 0; return Result; } bool X86::isVINSERT128Index(SDNode *N) { return isVINSERTIndex(N, 128); } bool X86::isVINSERT256Index(SDNode *N) { return isVINSERTIndex(N, 256); } bool X86::isVEXTRACT128Index(SDNode *N) { return isVEXTRACTIndex(N, 128); } bool X86::isVEXTRACT256Index(SDNode *N) { return isVEXTRACTIndex(N, 256); } /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. /// Handles 128-bit and 256-bit. static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { MVT VT = N->getSimpleValueType(0); assert((VT.getSizeInBits() >= 128) && "Unsupported vector type for PSHUF/SHUFP"); // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate // independently on 128-bit lanes. unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) && "Only supports 2, 4 or 8 elements per lane"); unsigned Shift = (NumLaneElts >= 4) ? 1 : 0; unsigned Mask = 0; for (unsigned i = 0; i != NumElts; ++i) { int Elt = N->getMaskElt(i); if (Elt < 0) continue; Elt &= NumLaneElts - 1; unsigned ShAmt = (i << Shift) % 8; Mask |= Elt << ShAmt; } return Mask; } /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { MVT VT = N->getSimpleValueType(0); assert((VT == MVT::v8i16 || VT == MVT::v16i16) && "Unsupported vector type for PSHUFHW"); unsigned NumElts = VT.getVectorNumElements(); unsigned Mask = 0; for (unsigned l = 0; l != NumElts; l += 8) { // 8 nodes per lane, but we only care about the last 4. for (unsigned i = 0; i < 4; ++i) { int Elt = N->getMaskElt(l+i+4); if (Elt < 0) continue; Elt &= 0x3; // only 2-bits. Mask |= Elt << (i * 2); } } return Mask; } /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { MVT VT = N->getSimpleValueType(0); assert((VT == MVT::v8i16 || VT == MVT::v16i16) && "Unsupported vector type for PSHUFHW"); unsigned NumElts = VT.getVectorNumElements(); unsigned Mask = 0; for (unsigned l = 0; l != NumElts; l += 8) { // 8 nodes per lane, but we only care about the first 4. for (unsigned i = 0; i < 4; ++i) { int Elt = N->getMaskElt(l+i); if (Elt < 0) continue; Elt &= 0x3; // only 2-bits Mask |= Elt << (i * 2); } } return Mask; } /// \brief Return the appropriate immediate to shuffle the specified /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with /// VALIGN (if Interlane is true) instructions. static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp, bool InterLane) { MVT VT = SVOp->getSimpleValueType(0); unsigned EltSize = InterLane ? 1 : VT.getVectorElementType().getSizeInBits() >> 3; unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; int Val = 0; unsigned i; for (i = 0; i != NumElts; ++i) { Val = SVOp->getMaskElt(i); if (Val >= 0) break; } if (Val >= (int)NumElts) Val -= NumElts - NumLaneElts; assert(Val - i > 0 && "PALIGNR imm should be positive"); return (Val - i) * EltSize; } /// \brief Return the appropriate immediate to shuffle the specified /// VECTOR_SHUFFLE mask with the PALIGNR instruction. static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { return getShuffleAlignrImmediate(SVOp, false); } /// \brief Return the appropriate immediate to shuffle the specified /// VECTOR_SHUFFLE mask with the VALIGN instruction. static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) { return getShuffleAlignrImmediate(SVOp, true); } static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa(N->getOperand(1).getNode())) llvm_unreachable("Illegal extract subvector for VEXTRACT"); uint64_t Index = cast(N->getOperand(1).getNode())->getZExtValue(); MVT VecVT = N->getOperand(0).getSimpleValueType(); MVT ElVT = VecVT.getVectorElementType(); unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; } static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa(N->getOperand(2).getNode())) llvm_unreachable("Illegal insert subvector for VINSERT"); uint64_t Index = cast(N->getOperand(2).getNode())->getZExtValue(); MVT VecVT = N->getSimpleValueType(0); MVT ElVT = VecVT.getVectorElementType(); unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; } /// getExtractVEXTRACT128Immediate - Return the appropriate immediate /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 /// and VINSERTI128 instructions. unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 128); } /// getExtractVEXTRACT256Immediate - Return the appropriate immediate /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4 /// and VINSERTI64x4 instructions. unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 256); } /// getInsertVINSERT128Immediate - Return the appropriate immediate /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 /// and VINSERTI128 instructions. unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 128); } /// getInsertVINSERT256Immediate - Return the appropriate immediate /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4 /// and VINSERTI64x4 instructions. unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 256); } /// isZero - Returns true if Elt is a constant integer zero static bool isZero(SDValue V) { ConstantSDNode *C = dyn_cast(V); return C && C->isNullValue(); } /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. bool X86::isZeroNode(SDValue Elt) { if (isZero(Elt)) return true; if (ConstantFPSDNode *CFP = dyn_cast(Elt)) return CFP->getValueAPF().isPosZero(); return false; } /// ShouldXformToMOVHLPS - Return true if the node should be transformed to /// match movhlps. The lower half elements should come from upper half of /// V1 (and in order), and the upper half elements should come from the upper /// half of V2 (and in order). static bool ShouldXformToMOVHLPS(ArrayRef Mask, MVT VT) { if (!VT.is128BitVector()) return false; if (VT.getVectorNumElements() != 4) return false; for (unsigned i = 0, e = 2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i+2)) return false; for (unsigned i = 2; i != 4; ++i) if (!isUndefOrEqual(Mask[i], i+4)) return false; return true; } /// isScalarLoadToVector - Returns true if the node is a scalar load that /// is promoted to a vector. It also returns the LoadSDNode by reference if /// required. static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) { if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) return false; N = N->getOperand(0).getNode(); if (!ISD::isNON_EXTLoad(N)) return false; if (LD) *LD = cast(N); return true; } // Test whether the given value is a vector value which will be legalized // into a load. static bool WillBeConstantPoolLoad(SDNode *N) { if (N->getOpcode() != ISD::BUILD_VECTOR) return false; // Check for any non-constant elements. for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) switch (N->getOperand(i).getNode()->getOpcode()) { case ISD::UNDEF: case ISD::ConstantFP: case ISD::Constant: break; default: return false; } // Vectors of all-zeros and all-ones are materialized with special // instructions rather than being loaded. return !ISD::isBuildVectorAllZeros(N) && !ISD::isBuildVectorAllOnes(N); } /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to /// match movlp{s|d}. The lower half elements should come from lower half of /// V1 (and in order), and the upper half elements should come from the upper /// half of V2 (and in order). And since V1 will become the source of the /// MOVLP, it must be either a vector load or a scalar load to vector. static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, ArrayRef Mask, MVT VT) { if (!VT.is128BitVector()) return false; if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) return false; // Is V2 is a vector load, don't do this transformation. We will try to use // load folding shufps op. if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2)) return false; unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 2 && NumElems != 4) return false; for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) if (!isUndefOrEqual(Mask[i], i+NumElems)) return false; return true; } /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved /// to an zero vector. /// FIXME: move to dag combiner / method on ShuffleVectorSDNode static bool isZeroShuffle(ShuffleVectorSDNode *N) { SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); unsigned NumElems = N->getValueType(0).getVectorNumElements(); for (unsigned i = 0; i != NumElems; ++i) { int Idx = N->getMaskElt(i); if (Idx >= (int)NumElems) { unsigned Opc = V2.getOpcode(); if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) continue; if (Opc != ISD::BUILD_VECTOR || !X86::isZeroNode(V2.getOperand(Idx-NumElems))) return false; } else if (Idx >= 0) { unsigned Opc = V1.getOpcode(); if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) continue; if (Opc != ISD::BUILD_VECTOR || !X86::isZeroNode(V1.getOperand(Idx))) return false; } } return true; } /// getZeroVector - Returns a vector of specified type with all zero elements. /// static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); // Always build SSE zero vectors as <4 x i32> bitcasted // to their dest type. This ensures they get CSE'd. SDValue Vec; if (VT.is128BitVector()) { // SSE if (Subtarget->hasSSE2()) { // SSE2 SDValue Cst = DAG.getConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else { // SSE1 SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } } else if (VT.is256BitVector()) { // AVX if (Subtarget->hasInt256()) { // AVX2 SDValue Cst = DAG.getConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // 256-bit logic and arithmetic instructions in AVX are all // floating-point, no support for integer ops. Emit fp zeroed vectors. SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); } } else if (VT.is512BitVector()) { // AVX-512 SDValue Cst = DAG.getConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.getScalarType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type"); SDValue Cst = DAG.getConstant(0, MVT::i1); SmallVector Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } else llvm_unreachable("Unexpected vector type"); return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } /// getOnesVector - Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. /// Then bitcast to their original type, ensuring they get CSE'd. static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); SDValue Cst = DAG.getConstant(~0U, MVT::i32); SDValue Vec; if (VT.is256BitVector()) { if (HasInt256) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // AVX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); } } else if (VT.is128BitVector()) { Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else llvm_unreachable("Unexpected vector type"); return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements /// that point to V2 points to its first element. static void NormalizeMask(SmallVectorImpl &Mask, unsigned NumElems) { for (unsigned i = 0; i != NumElems; ++i) { if (Mask[i] > (int)NumElems) { Mask[i] = NumElems; } } } /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector Mask; Mask.push_back(NumElems); for (unsigned i = 1; i != NumElems; ++i) Mask.push_back(i); return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } /// getUnpackl - Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector Mask; for (unsigned i = 0, e = NumElems/2; i != e; ++i) { Mask.push_back(i); Mask.push_back(i + NumElems); } return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } /// getUnpackh - Returns a vector_shuffle node for an unpackh operation. static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector Mask; for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { Mask.push_back(i + Half); Mask.push_back(i + NumElems + Half); } return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by // a generic shuffle instruction because the target has no such instructions. // Generate shuffles which repeat i16 and i8 several times until they can be // represented by v4f32 and then be manipulated by target suported shuffles. static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { MVT VT = V.getSimpleValueType(); int NumElems = VT.getVectorNumElements(); SDLoc dl(V); while (NumElems > 4) { if (EltNo < NumElems/2) { V = getUnpackl(DAG, dl, VT, V, V); } else { V = getUnpackh(DAG, dl, VT, V, V); EltNo -= NumElems/2; } NumElems >>= 1; } return V; } /// getLegalSplat - Generate a legal splat with supported x86 shuffles static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { MVT VT = V.getSimpleValueType(); SDLoc dl(V); if (VT.is128BitVector()) { V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), &SplatMask[0]); } else if (VT.is256BitVector()) { // To use VPERMILPS to splat scalars, the second half of indicies must // refer to the higher part, which is a duplication of the lower one, // because VPERMILPS can only handle in-lane permutations. int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), &SplatMask[0]); } else llvm_unreachable("Vector size not supported"); return DAG.getNode(ISD::BITCAST, dl, VT, V); } /// PromoteSplat - Splat is promoted to target supported vector shuffles. static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { MVT SrcVT = SV->getSimpleValueType(0); SDValue V1 = SV->getOperand(0); SDLoc dl(SV); int EltNo = SV->getSplatIndex(); int NumElems = SrcVT.getVectorNumElements(); bool Is256BitVec = SrcVT.is256BitVector(); assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) && "Unknown how to promote splat for type"); // Extract the 128-bit part containing the splat element and update // the splat element index when it refers to the higher register. if (Is256BitVec) { V1 = Extract128BitVector(V1, EltNo, DAG, dl); if (EltNo >= NumElems/2) EltNo -= NumElems/2; } // All i16 and i8 vector types can't be used directly by a generic shuffle // instruction because the target has no such instruction. Generate shuffles // which repeat i16 and i8 several times until they fit in i32, and then can // be manipulated by target suported shuffles. MVT EltVT = SrcVT.getVectorElementType(); if (EltVT == MVT::i8 || EltVT == MVT::i16) V1 = PromoteSplati8i16(V1, DAG, EltNo); // Recreate the 256-bit vector and place the same 128-bit vector // into the low and high part. This is necessary because we want // to use VPERM* to shuffle the vectors if (Is256BitVec) { V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1); } return getLegalSplat(DAG, V1, EltNo); } /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified /// vector of zero or undef vector. This produces a shuffle where the low /// element of V2 is swizzled into the zero/undef vector, landing at element /// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, bool IsZero, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = V2.getSimpleValueType(); SDValue V1 = IsZero ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); unsigned NumElems = VT.getVectorNumElements(); SmallVector MaskVec; for (unsigned i = 0; i != NumElems; ++i) // If this is the insertion idx, put the low elt of V2 here. MaskVec.push_back(i == Idx ? NumElems : i); return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); } /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the /// target specific opcode. Returns true if the Mask could be calculated. Sets /// IsUnary to true if only uses one source. Note that this will set IsUnary for /// shuffles which use a single input multiple times, and in those cases it will /// adjust the mask to only have indices within that single input. static bool getTargetShuffleMask(SDNode *N, MVT VT, SmallVectorImpl &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; IsUnary = false; bool IsFakeUnary = false; switch(N->getOpcode()) { case X86ISD::BLENDI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeBLENDMask(VT, cast(ImmN)->getZExtValue(), Mask); break; case X86ISD::SHUFP: ImmN = N->getOperand(N->getNumOperands()-1); DecodeSHUFPMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKL: DecodeUNPCKLMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVLHPS: DecodeMOVLHPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::PALIGNR: ImmN = N->getOperand(N->getNumOperands()-1); DecodePALIGNRMask(VT, cast(ImmN)->getZExtValue(), Mask); break; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFHW: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFHWMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFLW: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFLWMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFB: { IsUnary = true; SDValue MaskNode = N->getOperand(1); while (MaskNode->getOpcode() == ISD::BITCAST) MaskNode = MaskNode->getOperand(0); if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { // If we have a build-vector, then things are easy. EVT VT = MaskNode.getValueType(); assert(VT.isVector() && "Can't produce a non-vector with a build_vector!"); if (!VT.isInteger()) return false; int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8; SmallVector RawMask; for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) { SDValue Op = MaskNode->getOperand(i); if (Op->getOpcode() == ISD::UNDEF) { RawMask.push_back((uint64_t)SM_SentinelUndef); continue; } auto *CN = dyn_cast(Op.getNode()); if (!CN) return false; APInt MaskElement = CN->getAPIntValue(); // We now have to decode the element which could be any integer size and // extract each byte of it. for (int j = 0; j < NumBytesPerElement; ++j) { // Note that this is x86 and so always little endian: the low byte is // the first byte of the mask. RawMask.push_back(MaskElement.getLoBits(8).getZExtValue()); MaskElement = MaskElement.lshr(8); } } DecodePSHUFBMask(RawMask, Mask); break; } auto *MaskLoad = dyn_cast(MaskNode); if (!MaskLoad) return false; SDValue Ptr = MaskLoad->getBasePtr(); if (Ptr->getOpcode() == X86ISD::Wrapper) Ptr = Ptr->getOperand(0); auto *MaskCP = dyn_cast(Ptr); if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) return false; if (auto *C = dyn_cast(MaskCP->getConstVal())) { DecodePSHUFBMask(C, Mask); if (Mask.empty()) return false; break; } return false; } case X86ISD::VPERMI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERMMask(cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::MOVSS: case X86ISD::MOVSD: { // The index 0 always comes from the first element of the second source, // this is why MOVSS and MOVSD are used in the first place. The other // elements come from the other positions of the first source vector Mask.push_back(NumElems); for (unsigned i = 1; i != NumElems; ++i) { Mask.push_back(i); } break; } case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast(ImmN)->getZExtValue(), Mask); if (Mask.empty()) return false; break; case X86ISD::MOVSLDUP: DecodeMOVSLDUPMask(VT, Mask); break; case X86ISD::MOVSHDUP: DecodeMOVSHDUPMask(VT, Mask); break; case X86ISD::MOVDDUP: case X86ISD::MOVLHPD: case X86ISD::MOVLPD: case X86ISD::MOVLPS: // Not yet implemented return false; default: llvm_unreachable("unknown target shuffle node"); } // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. if (IsFakeUnary) for (int &M : Mask) if (M >= (int)Mask.size()) M -= Mask.size(); return true; } /// getShuffleScalarElt - Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { if (Depth == 6) return SDValue(); // Limit search depth. SDValue V = SDValue(N, 0); EVT VT = V.getValueType(); unsigned Opcode = V.getOpcode(); // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. if (const ShuffleVectorSDNode *SV = dyn_cast(N)) { int Elt = SV->getMaskElt(Index); if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); unsigned NumElems = VT.getVectorNumElements(); SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { MVT ShufVT = V.getSimpleValueType(); unsigned NumElems = ShufVT.getVectorNumElements(); SmallVector ShuffleMask; bool IsUnary; if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; if (Elt < 0) return DAG.getUNDEF(ShufVT.getVectorElementType()); SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) : N->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Actual nodes that may contain scalar elements if (Opcode == ISD::BITCAST) { V = V.getOperand(0); EVT SrcVT = V.getValueType(); unsigned NumElems = VT.getVectorNumElements(); if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) return SDValue(); } if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) return (Index == 0) ? V.getOperand(0) : DAG.getUNDEF(VT.getVectorElementType()); if (V.getOpcode() == ISD::BUILD_VECTOR) return V.getOperand(Index); return SDValue(); } /// getNumOfConsecutiveZeros - Return the number of elements of a vector /// shuffle operation which come from a consecutively from a zero. The /// search can start in two different directions, from left or right. /// We count undefs as zeros until PreferredNum is reached. static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems, bool ZerosFromLeft, SelectionDAG &DAG, unsigned PreferredNum = -1U) { unsigned NumZeros = 0; for (unsigned i = 0; i != NumElems; ++i) { unsigned Index = ZerosFromLeft ? i : NumElems - i - 1; SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0); if (!Elt.getNode()) break; if (X86::isZeroNode(Elt)) ++NumZeros; else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum. NumZeros = std::min(NumZeros + 1, PreferredNum); else break; } return NumZeros; } /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE) /// correspond consecutively to elements from one of the vector operands, /// starting from its index OpIdx. Also tell OpNum which source vector operand. static bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, unsigned MaskI, unsigned MaskE, unsigned OpIdx, unsigned NumElems, unsigned &OpNum) { bool SeenV1 = false; bool SeenV2 = false; for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) { int Idx = SVOp->getMaskElt(i); // Ignore undef indicies if (Idx < 0) continue; if (Idx < (int)NumElems) SeenV1 = true; else SeenV2 = true; // Only accept consecutive elements from the same vector if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2)) return false; } OpNum = SeenV1 ? 0 : 1; return true; } /// isVectorShiftRight - Returns true if the shuffle can be implemented as a /// logical left shift of a vector. static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { unsigned NumElems = SVOp->getSimpleValueType(0).getVectorNumElements(); unsigned NumZeros = getNumOfConsecutiveZeros( SVOp, NumElems, false /* check zeros from right */, DAG, SVOp->getMaskElt(0)); unsigned OpSrc; if (!NumZeros) return false; // Considering the elements in the mask that are not consecutive zeros, // check if they consecutively come from only one of the source vectors. // // V1 = {X, A, B, C} 0 // \ \ \ / // vector_shuffle V1, V2 <1, 2, 3, X> // if (!isShuffleMaskConsecutive(SVOp, 0, // Mask Start Index NumElems-NumZeros, // Mask End Index(exclusive) NumZeros, // Where to start looking in the src vector NumElems, // Number of elements in vector OpSrc)) // Which source operand ? return false; isLeft = false; ShAmt = NumZeros; ShVal = SVOp->getOperand(OpSrc); return true; } /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a /// logical left shift of a vector. static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { unsigned NumElems = SVOp->getSimpleValueType(0).getVectorNumElements(); unsigned NumZeros = getNumOfConsecutiveZeros( SVOp, NumElems, true /* check zeros from left */, DAG, NumElems - SVOp->getMaskElt(NumElems - 1) - 1); unsigned OpSrc; if (!NumZeros) return false; // Considering the elements in the mask that are not consecutive zeros, // check if they consecutively come from only one of the source vectors. // // 0 { A, B, X, X } = V2 // / \ / / // vector_shuffle V1, V2 // if (!isShuffleMaskConsecutive(SVOp, NumZeros, // Mask Start Index NumElems, // Mask End Index(exclusive) 0, // Where to start looking in the src vector NumElems, // Number of elements in vector OpSrc)) // Which source operand ? return false; isLeft = true; ShAmt = NumZeros; ShVal = SVOp->getOperand(OpSrc); return true; } /// isVectorShift - Returns true if the shuffle can be implemented as a /// logical left or right shift of a vector. static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { // Although the logic below support any bitwidth size, there are no // shift instructions which handle more than 128-bit vectors. if (!SVOp->getSimpleValueType(0).is128BitVector()) return false; if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) return true; return false; } /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. /// static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 8) return SDValue(); SDLoc dl(Op); SDValue V; bool First = true; for (unsigned i = 0; i < 16; ++i) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; if (ThisIsNonZero && First) { if (NumZero) V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; } if ((i & 1) != 0) { SDValue ThisElt, LastElt; bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; if (LastIsNonZero) { LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i-1)); } if (ThisIsNonZero) { ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt, DAG.getConstant(8, MVT::i8)); if (LastIsNonZero) ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); } else ThisElt = LastElt; if (ThisElt.getNode()) V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, DAG.getIntPtrConstant(i/2)); } } return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); } /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. /// static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 4) return SDValue(); SDLoc dl(Op); SDValue V; bool First = true; for (unsigned i = 0; i < 8; ++i) { bool isNonZero = (NonZeros & (1 << i)) != 0; if (isNonZero) { if (First) { if (NumZero) V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; } V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Op.getOperand(i), DAG.getIntPtrConstant(i)); } } return V; } /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { // Find all zeroable elements. bool Zeroable[4]; for (int i=0; i < 4; ++i) { SDValue Elt = Op->getOperand(i); Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)); } assert(std::count_if(&Zeroable[0], &Zeroable[4], [](bool M) { return !M; }) > 1 && "We expect at least two non-zero elements!"); // We only know how to deal with build_vector nodes where elements are either // zeroable or extract_vector_elt with constant index. SDValue FirstNonZero; unsigned FirstNonZeroIdx; for (unsigned i=0; i < 4; ++i) { if (Zeroable[i]) continue; SDValue Elt = Op->getOperand(i); if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Elt.getOperand(1))) return SDValue(); // Make sure that this node is extracting from a 128-bit vector. MVT VT = Elt.getOperand(0).getSimpleValueType(); if (!VT.is128BitVector()) return SDValue(); if (!FirstNonZero.getNode()) { FirstNonZero = Elt; FirstNonZeroIdx = i; } } assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); SDValue V1 = FirstNonZero.getOperand(0); MVT VT = V1.getSimpleValueType(); // See if this build_vector can be lowered as a blend with zero. SDValue Elt; unsigned EltMaskIdx, EltIdx; int Mask[4]; for (EltIdx = 0; EltIdx < 4; ++EltIdx) { if (Zeroable[EltIdx]) { // The zero vector will be on the right hand side. Mask[EltIdx] = EltIdx+4; continue; } Elt = Op->getOperand(EltIdx); // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. EltMaskIdx = cast(Elt.getOperand(1))->getZExtValue(); if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) break; Mask[EltIdx] = EltIdx; } if (EltIdx == 4) { // Let the shuffle legalizer deal with blend operations. SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); if (V1.getSimpleValueType() != VT) V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1); return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]); } // See if we can lower this build_vector to a INSERTPS. if (!Subtarget->hasSSE41()) return SDValue(); SDValue V2 = Elt.getOperand(0); if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) V1 = SDValue(); bool CanFold = true; for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { if (Zeroable[i]) continue; SDValue Current = Op->getOperand(i); SDValue SrcVector = Current->getOperand(0); if (!V1.getNode()) V1 = SrcVector; CanFold = SrcVector == V1 && cast(Current.getOperand(1))->getZExtValue() == i; } if (!CanFold) return SDValue(); assert(V1.getNode() && "Expected at least two non-zero elements!"); if (V1.getSimpleValueType() != MVT::v4f32) V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1); if (V2.getSimpleValueType() != MVT::v4f32) V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2); // Ok, we can emit an INSERTPS instruction. unsigned ZMask = 0; for (int i = 0; i < 4; ++i) if (Zeroable[i]) ZMask |= 1 << i; unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2, DAG.getIntPtrConstant(InsertPSMask)); return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result); } /// getVShift - Return a vector logical shift node. /// static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, SDLoc dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); EVT ShVT = MVT::v2i64; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opc, dl, ShVT, SrcOp, DAG.getConstant(NumBits, TLI.getScalarShiftAmountTy(SrcOp.getValueType())))); } static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { // Check if the scalar load can be widened into a vector load. And if // the address is "base + cst" see if the cst can be "absorbed" into // the shuffle mask. if (LoadSDNode *LD = dyn_cast(SrcOp)) { SDValue Ptr = LD->getBasePtr(); if (!ISD::isNormalLoad(LD) || LD->isVolatile()) return SDValue(); EVT PVT = LD->getValueType(0); if (PVT != MVT::i32 && PVT != MVT::f32) return SDValue(); int FI = -1; int64_t Offset = 0; if (FrameIndexSDNode *FINode = dyn_cast(Ptr)) { FI = FINode->getIndex(); Offset = 0; } else if (DAG.isBaseWithConstantOffset(Ptr) && isa(Ptr.getOperand(0))) { FI = cast(Ptr.getOperand(0))->getIndex(); Offset = Ptr.getConstantOperandVal(1); Ptr = Ptr.getOperand(0); } else { return SDValue(); } // FIXME: 256-bit vector instructions don't require a strict alignment, // improve this code to support it better. unsigned RequiredAlign = VT.getSizeInBits()/8; SDValue Chain = LD->getChain(); // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { if (MFI->isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. // If someone *really* cares about this. That's the way to implement it. return SDValue(); } else { MFI->setObjectAlignment(FI, RequiredAlign); } } // (Offset % 16 or 32) must be multiple of 4. Then address is then // Ptr + (Offset & ~15). if (Offset < 0) return SDValue(); if ((Offset % RequiredAlign) & 3) return SDValue(); int64_t StartOffset = Offset & ~(RequiredAlign-1); if (StartOffset) Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(), Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); int EltNo = (Offset - StartOffset) >> 2; unsigned NumElems = VT.getVectorNumElements(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset), false, false, false, 0); SmallVector Mask; for (unsigned i = 0; i != NumElems; ++i) Mask.push_back(EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); } return SDValue(); } /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a /// vector of type 'VT', see if the elements can be replaced by a single large /// load which has the same value as a build_vector whose operands are 'elts'. /// /// Example: -> zextload a /// /// FIXME: we'd also like to handle the case where the last elements are zero /// rather than undef via VZEXT_LOAD, but we do not detect that case today. /// There's even a handy isZeroNode for that purpose. static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, SDLoc &DL, SelectionDAG &DAG, bool isAfterLegalize) { EVT EltVT = VT.getVectorElementType(); unsigned NumElems = Elts.size(); LoadSDNode *LDBase = nullptr; unsigned LastLoadedElt = -1U; // For each element in the initializer, see if we've found a load or an undef. // If we don't find an initial load element, or later load elements are // non-consecutive, bail out. for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Elts[i]; if (!Elt.getNode() || (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) return SDValue(); if (!LDBase) { if (Elt.getNode()->getOpcode() == ISD::UNDEF) return SDValue(); LDBase = cast(Elt.getNode()); LastLoadedElt = i; continue; } if (Elt.getOpcode() == ISD::UNDEF) continue; LoadSDNode *LD = cast(Elt); if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) return SDValue(); LastLoadedElt = i; } // If we have found an entire vector of loads and undefs, then return a large // load of the entire vector width starting at the base pointer. If we found // consecutive loads for the low half, generate a vzext_load node. if (LastLoadedElt == NumElems - 1) { if (isAfterLegalize && !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT)) return SDValue(); SDValue NewLd = SDValue(); NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->isVolatile(), LDBase->isNonTemporal(), LDBase->isInvariant(), LDBase->getAlignment()); if (LDBase->hasAnyUseOfValue(1)) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1), SDValue(NewLd.getNode(), 1)); DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), SDValue(NewLd.getNode(), 1)); } return NewLd; } //TODO: The code below fires only for for loading the low v2i32 / v2f32 //of a v4i32 / v4f32. It's probably worth generalizing. if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) && DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64, LDBase->getPointerInfo(), LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, false/*WriteMem*/); // Make sure the newly-created LOAD is in the same position as LDBase in // terms of dependency. We create a TokenFactor for LDBase and ResNode, and // update uses of LDBase's output chain to use the TokenFactor. if (LDBase->hasAnyUseOfValue(1)) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); } return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); } return SDValue(); } /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction /// to generate a splat value for the following cases: /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. /// 2. A splat shuffle which uses a scalar_to_vector node which comes from /// a scalar load, or a constant. /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, SelectionDAG &DAG) { // VBROADCAST requires AVX. // TODO: Splats could be generated for non-AVX CPUs using SSE // instructions, but there's less potential gain for only 128-bit vectors. if (!Subtarget->hasAVX()) return SDValue(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); SDValue Ld; bool ConstSplatVal; switch (Op.getOpcode()) { default: // Unknown pattern found. return SDValue(); case ISD::BUILD_VECTOR: { auto *BVOp = cast(Op.getNode()); BitVector UndefElements; SDValue Splat = BVOp->getSplatValue(&UndefElements); // We need a splat of a single value to use broadcast, and it doesn't // make any sense if the value is only in one element of the vector. if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) return SDValue(); Ld = Splat; ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); // Make sure that all of the users of a non-constant load are from the // BUILD_VECTOR node. if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); break; } case ISD::VECTOR_SHUFFLE: { ShuffleVectorSDNode *SVOp = cast(Op); // Shuffles must have a splat mask where the first element is // broadcasted. if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) return SDValue(); SDValue Sc = Op.getOperand(0); if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && Sc.getOpcode() != ISD::BUILD_VECTOR) { if (!Subtarget->hasInt256()) return SDValue(); // Use the register form of the broadcast instruction available on AVX2. if (VT.getSizeInBits() >= 256) Sc = Extract128BitVector(Sc, 0, DAG, dl); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); } Ld = Sc.getOperand(0); ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); // The scalar_to_vector node and the suspected // load node must have exactly one user. // Constants may have multiple users. // AVX-512 has register version of the broadcast bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && Ld.getValueType().getSizeInBits() >= 32; if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && !hasRegVer)) return SDValue(); break; } } unsigned ScalarSize = Ld.getValueType().getSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); // When optimizing for size, generate up to 5 extra bytes for a broadcast // instruction to save 8 or more bytes of constant pool data. // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. const Function *F = DAG.getMachineFunction().getFunction(); bool OptForSize = F->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); // Handle broadcasting a single constant scalar from the constant pool // into a vector. // On Sandybridge (no AVX2), it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. // But override that restriction when optimizing for size. // TODO: Check if splatting is recommended for other AVX-capable CPUs. if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. // For size optimization, also splat v2f64 and v2i64, and for size opt // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast(Ld)) C = CI->getConstantIntValue(); else if (ConstantFPSDNode *CF = dyn_cast(Ld)) C = CF->getConstantFPValue(); assert(C && "Invalid constant type"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); unsigned Alignment = cast(CP)->getAlignment(); Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(), false, false, false, Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } } bool IsLoad = ISD::isNormalLoad(Ld.getNode()); // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget->hasInt256() && (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The scalar source must be a normal load. if (!IsLoad) return SDValue(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (Subtarget->hasVLX() && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is no vbroadcastsd xmm if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) { if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } // Unsupported broadcast. return SDValue(); } /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real /// underlying vector and index. /// /// Modifies \p ExtractedFromVec to the real vector and returns the real /// index. static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx) { int Idx = cast(ExtIdx)->getZExtValue(); if (!isa(ExtractedFromVec)) return Idx; // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already // lowered this: // (extract_vector_elt (v8f32 %vreg1), Constant<6>) // to: // (extract_vector_elt (vector_shuffle<2,u,u,u> // (extract_subvector (v8f32 %vreg0), Constant<4>), // undef) // Constant<0>) // In this case the vector is the extract_subvector expression and the index // is 2, as specified by the shuffle. ShuffleVectorSDNode *SVOp = cast(ExtractedFromVec); SDValue ShuffleVec = SVOp->getOperand(0); MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); assert(ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType().getVectorElementType()); int ShuffleIdx = SVOp->getMaskElt(Idx); if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { ExtractedFromVec = ShuffleVec; return ShuffleIdx; } return Idx; } static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // Skip if insert_vec_elt is not supported. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) return SDValue(); SDLoc DL(Op); unsigned NumElems = Op.getNumOperands(); SDValue VecIn1; SDValue VecIn2; SmallVector InsertIndices; SmallVector Mask(NumElems, -1); for (unsigned i = 0; i != NumElems; ++i) { unsigned Opc = Op.getOperand(i).getOpcode(); if (Opc == ISD::UNDEF) continue; if (Opc != ISD::EXTRACT_VECTOR_ELT) { // Quit if more than 1 elements need inserting. if (InsertIndices.size() > 1) return SDValue(); InsertIndices.push_back(i); continue; } SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); // Quit if non-constant index. if (!isa(ExtIdx)) return SDValue(); int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); // Quit if extracted from vector of different type. if (ExtractedFromVec.getValueType() != VT) return SDValue(); if (!VecIn1.getNode()) VecIn1 = ExtractedFromVec; else if (VecIn1 != ExtractedFromVec) { if (!VecIn2.getNode()) VecIn2 = ExtractedFromVec; else if (VecIn2 != ExtractedFromVec) // Quit if more than 2 vectors to shuffle return SDValue(); } if (ExtractedFromVec == VecIn1) Mask[i] = Idx; else if (ExtractedFromVec == VecIn2) Mask[i] = Idx + NumElems; } if (!VecIn1.getNode()) return SDValue(); VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { unsigned Idx = InsertIndices[i]; NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), DAG.getIntPtrConstant(Idx)); } return NV; } // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. SDValue X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) && "Unexpected type in LowerBUILD_VECTORvXi1!"); SDLoc dl(Op); if (ISD::isBuildVectorAllZeros(Op.getNode())) { SDValue Cst = DAG.getTargetConstant(0, MVT::i1); SmallVector Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } if (ISD::isBuildVectorAllOnes(Op.getNode())) { SDValue Cst = DAG.getTargetConstant(1, MVT::i1); SmallVector Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } bool AllContants = true; uint64_t Immediate = 0; int NonConstIdx = -1; bool IsSplat = true; unsigned NumNonConsts = 0; unsigned NumConsts = 0; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.getOpcode() == ISD::UNDEF) continue; if (!isa(In)) { AllContants = false; NonConstIdx = idx; NumNonConsts++; } else { NumConsts++; if (cast(In)->getZExtValue()) Immediate |= (1ULL << idx); } if (In != Op.getOperand(0)) IsSplat = false; } if (AllContants) { SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, DAG.getConstant(Immediate, MVT::i16)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask, DAG.getIntPtrConstant(0)); } if (NumNonConsts == 1 && NonConstIdx != 0) { SDValue DstVec; if (NumConsts) { SDValue VecAsImm = DAG.getConstant(Immediate, MVT::getIntegerVT(VT.getSizeInBits())); DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm); } else DstVec = DAG.getUNDEF(VT); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(NonConstIdx), DAG.getIntPtrConstant(NonConstIdx)); } if (!IsSplat && (NonConstIdx != 0)) llvm_unreachable("Unsupported BUILD_VECTOR operation"); MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8; SDValue Select; if (IsSplat) Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0), DAG.getConstant(-1, SelectVT), DAG.getConstant(0, SelectVT)); else Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0), DAG.getConstant((Immediate | 1), SelectVT), DAG.getConstant(Immediate, SelectVT)); return DAG.getNode(ISD::BITCAST, dl, VT, Select); } /// \brief Return true if \p N implements a horizontal binop and return the /// operands for the horizontal binop into V0 and V1. /// /// This is a helper function of PerformBUILD_VECTORCombine. /// This function checks that the build_vector \p N in input implements a /// horizontal operation. Parameter \p Opcode defines the kind of horizontal /// operation to match. /// For example, if \p Opcode is equal to ISD::ADD, then this function /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode /// is equal to ISD::SUB, then this function checks if this is a horizontal /// arithmetic sub. /// /// This function only analyzes elements of \p N whose indices are /// in range [BaseIdx, LastIdx). static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1) { EVT VT = N->getValueType(0); assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; unsigned NumElts = LastIdx - BaseIdx; V0 = DAG.getUNDEF(VT); V1 = DAG.getUNDEF(VT); // Check if N implements a horizontal binop. for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { SDValue Op = N->getOperand(i + BaseIdx); // Skip UNDEFs. if (Op->getOpcode() == ISD::UNDEF) { // Update the expected vector extract index. if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; ExpectedVExtractIdx += 2; continue; } CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); if (!CanFold) break; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op0.getOperand(0) == Op1.getOperand(0) && isa(Op0.getOperand(1)) && isa(Op1.getOperand(1))); if (!CanFold) break; unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); unsigned I1 = cast(Op1.getOperand(1))->getZExtValue(); if (i * 2 < NumElts) { if (V0.getOpcode() == ISD::UNDEF) V0 = Op0.getOperand(0); } else { if (V1.getOpcode() == ISD::UNDEF) V1 = Op0.getOperand(0); if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; } SDValue Expected = (i * 2 < NumElts) ? V0 : V1; if (I0 == ExpectedVExtractIdx) CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; else if (IsCommutable && I1 == ExpectedVExtractIdx) { // Try to match the following dag sequence: // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; } else CanFold = false; ExpectedVExtractIdx += 2; } return CanFold; } /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by /// a concat_vector. /// /// This is a helper function of PerformBUILD_VECTORCombine. /// This function expects two 256-bit vectors called V0 and V1. /// At first, each vector is split into two separate 128-bit vectors. /// Then, the resulting 128-bit vectors are used to implement two /// horizontal binary operations. /// /// The kind of horizontal binary operation is defined by \p X86Opcode. /// /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to /// the two new horizontal binop. /// When Mode is set, the first horizontal binop dag node would take as input /// the lower 128-bit of V0 and the upper 128-bit of V0. The second /// horizontal binop dag node would take as input the lower 128-bit of V1 /// and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V0_HI /// HADD V1_LO, V1_HI /// /// Otherwise, the first horizontal binop dag node takes as input the lower /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V1_LO /// HADD V0_HI, V1_HI /// /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to /// the upper 128-bits of the result. static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, SDLoc DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI) { EVT VT = V0.getValueType(); assert(VT.is256BitVector() && VT == V1.getValueType() && "Invalid nodes in input!"); unsigned NumElts = VT.getVectorNumElements(); SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL); SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL); SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL); SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL); EVT NewVT = V0_LO.getValueType(); SDValue LO = DAG.getUNDEF(NewVT); SDValue HI = DAG.getUNDEF(NewVT); if (Mode) { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && V0->getOpcode() != ISD::UNDEF) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); if (!isUndefHI && V1->getOpcode() != ISD::UNDEF) HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); } else { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF || V1_LO->getOpcode() != ISD::UNDEF)) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF || V1_HI->getOpcode() != ISD::UNDEF)) HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); } /// \brief Try to fold a build_vector that performs an 'addsub' into the /// sequence of 'vadd + vsub + blendi'. static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDLoc DL(BV); EVT VT = BV->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); SDValue InVec0 = DAG.getUNDEF(VT); SDValue InVec1 = DAG.getUNDEF(VT); assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v2f64) && "build_vector with an invalid type found!"); // Odd-numbered elements in the input build vector are obtained from // adding two integer/float elements. // Even-numbered elements in the input build vector are obtained from // subtracting two integer/float elements. unsigned ExpectedOpcode = ISD::FSUB; unsigned NextExpectedOpcode = ISD::FADD; bool AddFound = false; bool SubFound = false; for (unsigned i = 0, e = NumElts; i != e; i++) { SDValue Op = BV->getOperand(i); // Skip 'undef' values. unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::UNDEF) { std::swap(ExpectedOpcode, NextExpectedOpcode); continue; } // Early exit if we found an unexpected opcode. if (Opcode != ExpectedOpcode) return SDValue(); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) // Early exit if we cannot match that sequence. if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Op0.getOperand(1)) || !isa(Op1.getOperand(1)) || Op0.getOperand(1) != Op1.getOperand(1)) return SDValue(); unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); if (I0 != i) return SDValue(); // We found a valid add/sub node. Update the information accordingly. if (i & 1) AddFound = true; else SubFound = true; // Update InVec0 and InVec1. if (InVec0.getOpcode() == ISD::UNDEF) InVec0 = Op0.getOperand(0); if (InVec1.getOpcode() == ISD::UNDEF) InVec1 = Op1.getOperand(0); // Make sure that operands in input to each add/sub node always // come from a same pair of vectors. if (InVec0 != Op0.getOperand(0)) { if (ExpectedOpcode == ISD::FSUB) return SDValue(); // FADD is commutable. Try to commute the operands // and then test again. std::swap(Op0, Op1); if (InVec0 != Op0.getOperand(0)) return SDValue(); } if (InVec1 != Op1.getOperand(0)) return SDValue(); // Update the pair of expected opcodes. std::swap(ExpectedOpcode, NextExpectedOpcode); } // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF && InVec1.getOpcode() != ISD::UNDEF) return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1); return SDValue(); } static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDLoc DL(N); EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); BuildVectorSDNode *BV = cast(N); SDValue InVec0, InVec1; // Try to match an ADDSUB. if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) { SDValue Value = matchAddSub(BV, DAG, Subtarget); if (Value.getNode()) return Value; } // Try to match horizontal ADD/SUB. unsigned NumUndefsLO = 0; unsigned NumUndefsHI = 0; unsigned Half = NumElts/2; // Count the number of UNDEF operands in the build_vector in input. for (unsigned i = 0, e = Half; i != e; ++i) if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) NumUndefsLO++; for (unsigned i = Half, e = NumElts; i != e; ++i) if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) NumUndefsHI++; // Early exit if this is either a build_vector of all UNDEFs or all the // operands but one are UNDEF. if (NumUndefsLO + NumUndefsHI + 1 >= NumElts) return SDValue(); if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) { // Try to match an SSE3 float HADD/HSUB. if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { // Try to match an SSSE3 integer HADD/HSUB. if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); } if (!Subtarget->hasAVX()) return SDValue(); if ((VT == MVT::v8f32 || VT == MVT::v4f64)) { // Try to match an AVX horizontal add/sub of packed single/double // precision floating point values from 256-bit vectors. SDValue InVec2, InVec3; if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.getOpcode() == ISD::UNDEF || InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && ((InVec1.getOpcode() == ISD::UNDEF || InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.getOpcode() == ISD::UNDEF || InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && ((InVec1.getOpcode() == ISD::UNDEF || InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { // Try to match an AVX2 horizontal add/sub of signed integers. SDValue InVec2, InVec3; unsigned X86Opcode; bool CanFold = true; if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.getOpcode() == ISD::UNDEF || InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && ((InVec1.getOpcode() == ISD::UNDEF || InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.getOpcode() == ISD::UNDEF || InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && ((InVec1.getOpcode() == ISD::UNDEF || InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) X86Opcode = X86ISD::HSUB; else CanFold = false; if (CanFold) { // Fold this build_vector into a single horizontal add/sub. // Do this only if the target has AVX2. if (Subtarget->hasAVX2()) return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); // Do not try to expand this build_vector into a pair of horizontal // add/sub if we can emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into a pair of horizontal binop followed by // a concat vector. bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false, isUndefLO, isUndefHI); } } if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || VT == MVT::v16i16) && Subtarget->hasAVX()) { unsigned X86Opcode; if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HSUB; else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHADD; else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHSUB; else return SDValue(); // Don't try to expand this build_vector into a pair of horizontal add/sub // if we can simply emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into two horizontal add/sub followed by // a concat vector. bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, isUndefLO, isUndefHI); } return SDValue(); } SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT ExtVT = VT.getVectorElementType(); unsigned NumElems = Op.getNumOperands(); // Generate vectors for predicate vectors. if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512()) return LowerBUILD_VECTORvXi1(Op, DAG); // Vectors containing all zeros can be matched by pxor and xorps later if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) return Op; return getZeroVector(VT, Subtarget, DAG, dl); } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) return Op; if (!VT.is512BitVector()) return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); } SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); if (Broadcast.getNode()) return Broadcast; unsigned EVTBits = ExtVT.getSizeInBits(); unsigned NumZero = 0; unsigned NumNonZero = 0; unsigned NonZeros = 0; bool IsAllConstants = true; SmallSet Values; for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (Elt.getOpcode() == ISD::UNDEF) continue; Values.insert(Elt); if (Elt.getOpcode() != ISD::Constant && Elt.getOpcode() != ISD::ConstantFP) IsAllConstants = false; if (X86::isZeroNode(Elt)) NumZero++; else { NonZeros |= (1 << i); NumNonZero++; } } // All undef vector. Return an UNDEF. All zero vectors were handled above. if (NumNonZero == 0) return DAG.getUNDEF(VT); // Special case for single non-zero, non-undef, element. if (NumNonZero == 1) { unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); // If this is an insertion of an i64 value on x86-32, and if the top bits of // the value are obviously zero, truncate the value to i32 and do the // insertion that way. Only do this if the value is non-constant or if the // value is a constant being inserted into element 0. It is cheaper to do // a constant pool load than it is to do a movd + shuffle. if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && (!IsAllConstants || Idx == 0)) { if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { // Handle SSE only. assert(VT == MVT::v2i64 && "Expected an SSE value type!"); EVT VecVT = MVT::v4i32; unsigned VecElts = 4; // Truncate the value (which may itself be a constant) to i32, and // convert it to a vector with movd (S2V+shuffle to zero extend). Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); // If using the new shuffle lowering, just directly insert this. if (ExperimentalVectorShuffleLowering) return DAG.getNode( ISD::BITCAST, dl, VT, getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG)); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); // Now we have our 32-bit value zero extended in the low element of // a vector. If Idx != 0, swizzle it into place. if (Idx != 0) { SmallVector Mask; Mask.push_back(Idx); for (unsigned i = 1; i != VecElts; ++i) Mask.push_back(i); Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT), &Mask[0]); } return DAG.getNode(ISD::BITCAST, dl, VT, Item); } } // If we have a constant or non-constant insertion into the low element of // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into // the rest of the elements. This will be matched as movd/movq/movss/movsd // depending on what the source datatype is. if (Idx == 0) { if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget->is64Bit())) { if (VT.is256BitVector() || VT.is512BitVector()) { SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, Item, DAG.getIntPtrConstant(0)); } assert(VT.is128BitVector() && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); if (VT.is256BitVector()) { SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); } else { assert(VT.is128BitVector() && "Expected an SSE value type!"); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } return DAG.getNode(ISD::BITCAST, dl, VT, Item); } } // Is it a vector logical left shift? if (NumElems == 2 && Idx == 1 && X86::isZeroNode(Op.getOperand(0)) && !X86::isZeroNode(Op.getOperand(1))) { unsigned NumBits = VT.getSizeInBits(); return getVShift(true, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1)), NumBits/2, DAG, *this, dl); } if (IsAllConstants) // Otherwise, it's better to do a constpool load. return SDValue(); // Otherwise, if this is a vector with i32 or f32 elements, and the element // is a non-constant being inserted into an element other than the low one, // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka // movd/movss) to move this into the low element, then shuffle it into // place. if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // If using the new shuffle lowering, just directly insert this. if (ExperimentalVectorShuffleLowering) return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); // Turn it into a shuffle of zero and zero-extended scalar to vector. Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); SmallVector MaskVec; for (unsigned i = 0; i != NumElems; ++i) MaskVec.push_back(i == Idx ? 0 : 1); return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); } } // Splat is obviously ok. Let legalizer expand it to a shuffle. if (Values.size() == 1) { if (EVTBits == 32) { // Instead of a shuffle like this: // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> // Check if it's possible to issue this instead. // shuffle (vload ptr)), undef, <1, 1, 1, 1> unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); if (Op.getNode()->isOnlyUserOf(Item.getNode())) return LowerAsSplatVectorLoad(Item, VT, dl, DAG); } return SDValue(); } // A vector full of immediates; various special cases are already // handled, so this is best done with a single constant-pool load. if (IsAllConstants) return SDValue(); // For AVX-length vectors, see if we can use a vector load to get all of the // elements, otherwise build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.is256BitVector() || VT.is512BitVector()) { SmallVector V; for (unsigned i = 0; i != NumElems; ++i) V.push_back(Op.getOperand(i)); // Check for a build vector of consecutive loads. if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) return LD; EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); // Build both the lower and upper subvector. SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, makeArrayRef(&V[0], NumElems/2)); SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, makeArrayRef(&V[NumElems / 2], NumElems/2)); // Recreate the wider vector with the lower and upper part. if (VT.is256BitVector()) return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl); } // Let legalizer expand 2-wide build_vectors. if (EVTBits == 64) { if (NumNonZero == 1) { // One half is zero or undef. unsigned Idx = countTrailingZeros(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); } // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) { SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, Subtarget, *this); if (V.getNode()) return V; } if (EVTBits == 16 && NumElems == 8) { SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, Subtarget, *this); if (V.getNode()) return V; } // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) { SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this); if (V.getNode()) return V; } // If element VT is == 32 bits, turn it into a number of shuffles. SmallVector V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1 << i)); if (isZero) V[i] = getZeroVector(VT, Subtarget, DAG, dl); else V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } for (unsigned i = 0; i < 2; ++i) { switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { default: break; case 0: V[i] = V[i*2]; // Must be a zero vector. break; case 1: V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); break; case 2: V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); break; case 3: V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); break; } } bool Reverse1 = (NonZeros & 0x3) == 2; bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; int MaskVec[] = { Reverse1 ? 1 : 0, Reverse1 ? 0 : 1, static_cast(Reverse2 ? NumElems+1 : NumElems), static_cast(Reverse2 ? NumElems : NumElems+1) }; return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); } if (Values.size() > 1 && VT.is128BitVector()) { // Check for a build vector of consecutive loads. for (unsigned i = 0; i < NumElems; ++i) V[i] = Op.getOperand(i); // Check for elements which are consecutive loads. SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false); if (LD.getNode()) return LD; // Check for a build vector from mostly shuffle plus few inserting. SDValue Sh = buildFromShuffleMostly(Op, DAG); if (Sh.getNode()) return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. if (getSubtarget()->hasSSE41()) { SDValue Result; if (Op.getOperand(0).getOpcode() != ISD::UNDEF) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); else Result = DAG.getUNDEF(VT); for (unsigned i = 1; i < NumElems; ++i) { if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Op.getOperand(i), DAG.getIntPtrConstant(i)); } return Result; } // Otherwise, expand into a number of unpckl*, start by extending each of // our (non-undef) elements to the full vector width with the element in the // bottom slot of the vector (which generates no code for SSE). for (unsigned i = 0; i < NumElems; ++i) { if (Op.getOperand(i).getOpcode() != ISD::UNDEF) V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); else V[i] = DAG.getUNDEF(VT); } // Next, we iteratively mix elements, e.g. for v4f32: // Step 1: unpcklps 0, 2 ==> X: // : unpcklps 1, 3 ==> Y: // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> unsigned EltStride = NumElems >> 1; while (EltStride != 0) { for (unsigned i = 0; i < EltStride; ++i) { // If V[i+EltStride] is undef and this is the first round of mixing, // then it is safe to just drop this shuffle: V[i] is already in the // right place, the one element (since it's the first round) being // inserted as undef can be dropped. This isn't safe for successive // rounds because they will permute elements within both vectors. if (V[i+EltStride].getOpcode() == ISD::UNDEF && EltStride == NumElems/2) continue; V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); } EltStride >>= 1; } return V[0]; } return SDValue(); } // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); assert((ResVT.is256BitVector() || ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); if(ResVT.is256BitVector()) return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); if (Op.getNumOperands() == 4) { MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), ResVT.getVectorNumElements()/2); SDValue V3 = Op.getOperand(2); SDValue V4 = Op.getOperand(3); return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl), Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl); } return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType(); assert((VT.is256BitVector() && Op.getNumOperands() == 2) || (VT.is512BitVector() && (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))); // AVX can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors return LowerAVXCONCAT_VECTORS(Op, DAG); } //===----------------------------------------------------------------------===// // Vector shuffle lowering // // This is an experimental code path for lowering vector shuffles on x86. It is // designed to handle arbitrary vector shuffles and blends, gracefully // degrading performance as necessary. It works hard to recognize idiomatic // shuffles and lower them to optimal instruction patterns without leaving // a framework that allows reasonably efficient handling of all vector shuffle // patterns. //===----------------------------------------------------------------------===// /// \brief Tiny helper function to identify a no-op mask. /// /// This is a somewhat boring predicate function. It checks whether the mask /// array input, which is assumed to be a single-input shuffle mask of the kind /// used by the X86 shuffle instructions (not a fully general /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an /// in-place shuffle are 'no-op's. static bool isNoopShuffleMask(ArrayRef Mask) { for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] != -1 && Mask[i] != i) return false; return true; } /// \brief Helper function to classify a mask as a single-input mask. /// /// This isn't a generic single-input test because in the vector shuffle /// lowering we canonicalize single inputs to be the first input operand. This /// means we can more quickly test for a single input by only checking whether /// an input from the second operand exists. We also assume that the size of /// mask corresponds to the size of the input vectors which isn't true in the /// fully general case. static bool isSingleInputShuffleMask(ArrayRef Mask) { for (int M : Mask) if (M >= (int)Mask.size()) return false; return true; } /// \brief Test whether there are elements crossing 128-bit lanes in this /// shuffle mask. /// /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations /// and we routinely test for these. static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { int LaneSize = 128 / VT.getScalarSizeInBits(); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) return true; return false; } /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane. /// /// This checks a shuffle mask to see if it is performing the same /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies /// that it is also not lane-crossing. It may however involve a blend from the /// same lane of a second vector. /// /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is /// non-trivial to compute in the face of undef lanes. The representation is /// *not* suitable for use with existing 128-bit shuffles as it will contain /// entries from both V1 and V2 inputs to the wider mask. static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { int LaneSize = 128 / VT.getScalarSizeInBits(); RepeatedMask.resize(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. if (RepeatedMask[i % LaneSize] == -1) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) // Found a mismatch with the repeated mask. return false; } return true; } // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC // 2013 will allow us to use it as a non-type template parameter. namespace { /// \brief Implementation of the \c isShuffleEquivalent variadic functor. /// /// See its documentation for details. bool isShuffleEquivalentImpl(ArrayRef Mask, ArrayRef Args) { if (Mask.size() != Args.size()) return false; for (int i = 0, e = Mask.size(); i < e; ++i) { assert(*Args[i] >= 0 && "Arguments must be positive integers!"); if (Mask[i] != -1 && Mask[i] != *Args[i]) return false; } return true; } } // namespace /// \brief Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// /// This is a fast way to test a shuffle mask against a fixed pattern: /// /// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... } /// /// It returns true if the mask is exactly as wide as the argument list, and /// each element of the mask is either -1 (signifying undef) or the value given /// in the argument. static const VariadicFunction1< bool, ArrayRef, int, isShuffleEquivalentImpl> isShuffleEquivalent = {}; /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to /// the ubiquitous shuffle encoding scheme used in x86 instructions for /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for /// example. /// /// NB: We rely heavily on "undef" masks preserving the input lane. static SDValue getV4X86ShuffleImm8ForMask(ArrayRef Mask, SelectionDAG &DAG) { assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); unsigned Imm = 0; Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0; Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2; Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4; Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6; return DAG.getConstant(Imm, MVT::i8); } /// \brief Try to emit a blend instruction for a shuffle. /// /// This doesn't do any checks for the availability of instructions for blending /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is /// that the shuffle mask is in fact a blend. static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { unsigned BlendMask = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] >= Size) { if (Mask[i] != i + Size) return SDValue(); // Shuffled V2 input! BlendMask |= 1u << i; continue; } if (Mask[i] >= 0 && Mask[i] != i) return SDValue(); // Shuffled V1 input! } switch (VT.SimpleTy) { case MVT::v2f64: case MVT::v4f32: case MVT::v4f64: case MVT::v8f32: return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, DAG.getConstant(BlendMask, MVT::i8)); case MVT::v4i64: case MVT::v8i32: assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); // FALLTHROUGH case MVT::v2i64: case MVT::v4i32: // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into // that instruction. if (Subtarget->hasAVX2()) { // Scale the blend by the number of 32-bit dwords per element. int Scale = VT.getScalarSizeInBits() / 32; BlendMask = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= Size) for (int j = 0; j < Scale; ++j) BlendMask |= 1u << (i * Scale + j); MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1); V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, DAG.getConstant(BlendMask, MVT::i8))); } // FALLTHROUGH case MVT::v8i16: { // For integer shuffles we need to expand the mask and cast the inputs to // v8i16s prior to blending. int Scale = 8 / VT.getVectorNumElements(); BlendMask = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= Size) for (int j = 0; j < Scale; ++j) BlendMask |= 1u << (i * Scale + j); V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, DAG.getConstant(BlendMask, MVT::i8))); } case MVT::v16i16: { assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // We can lower these with PBLENDW which is mirrored across 128-bit lanes. assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); BlendMask = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 16) BlendMask |= 1u << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(BlendMask, MVT::i8)); } } // FALLTHROUGH case MVT::v32i8: { assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!"); // Compute the VSELECT mask. Note that VSELECT is really confusing in the // mix of LLVM's code generator and the x86 backend. We tell the code // generator that boolean values in the elements of an x86 vector register // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' // mapping a select to operand #1, and 'false' mapping to operand #2. The // reality in x86 is that vector masks (pre-AVX-512) use only the high bit // of the element (the remaining are ignored) and 0 in that high bit would // mean operand #1 while 1 in the high bit would mean operand #2. So while // the LLVM model for boolean values in vector elements gets the relevant // bit set, it is set backwards and over constrained relative to x86's // actual model. SDValue VSELECTMask[32]; for (int i = 0, Size = Mask.size(); i < Size; ++i) for (int j = 0; j < Scale; ++j) VSELECTMask[Scale * i + j] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8); V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1); V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2); return DAG.getNode( ISD::BITCAST, DL, VT, DAG.getNode(ISD::VSELECT, DL, MVT::v32i8, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask), V1, V2)); } default: llvm_unreachable("Not a supported integer vector type!"); } } /// \brief Generic routine to lower a shuffle and blend as a decomposed set of /// unblended shuffles followed by an unshuffled blend. /// /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { // Shuffle the input elements into the desired positions in V1 and V2 and // blend them together. SmallVector V1Mask(Mask.size(), -1); SmallVector V2Mask(Mask.size(), -1); SmallVector BlendMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= 0 && Mask[i] < Size) { V1Mask[i] = Mask[i]; BlendMask[i] = i; } else if (Mask[i] >= Size) { V2Mask[i] = Mask[i] - Size; BlendMask[i] = i + Size; } V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); } /// \brief Try to lower a vector shuffle as a byte rotation. /// /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will /// try to generically lower a vector shuffle through such an pattern. It /// does not check for the profitability of lowering either as PALIGNR or /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. /// This matches shuffle vectors that look like: /// /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. /// /// Note that this only handles 128-bit vector widths currently. static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); // We need to detect various ways of spelling a rotation: // [11, 12, 13, 14, 15, 0, 1, 2] // [-1, 12, 13, 14, -1, -1, 1, -1] // [-1, -1, -1, -1, -1, -1, 1, 2] // [ 3, 4, 5, 6, 7, 8, 9, 10] // [-1, 4, 5, 6, -1, -1, 9, -1] // [-1, 4, 5, 6, -1, -1, -1, -1] int Rotation = 0; SDValue Lo, Hi; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] == -1) continue; assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!"); // Based on the mod-Size value of this mask element determine where // a rotated vector would have started. int StartIdx = i - (Mask[i] % Size); if (StartIdx == 0) // The identity rotation isn't interesting, stop. return SDValue(); // If we found the tail of a vector the rotation must be the missing // front. If we found the head of a vector, it must be how much of the head. int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx; if (Rotation == 0) Rotation = CandidateRotation; else if (Rotation != CandidateRotation) // The rotations don't match, so we can't match this mask. return SDValue(); // Compute which value this mask is pointing at. SDValue MaskV = Mask[i] < Size ? V1 : V2; // Compute which of the two target values this index should be assigned to. // This reflects whether the high elements are remaining or the low elements // are remaining. SDValue &TargetV = StartIdx < 0 ? Hi : Lo; // Either set up this value if we've not encountered it before, or check // that it remains consistent. if (!TargetV) TargetV = MaskV; else if (TargetV != MaskV) // This may be a rotation, but it pulls from the inputs in some // unsupported interleaving. return SDValue(); } // Check that we successfully analyzed the mask, and normalize the results. assert(Rotation != 0 && "Failed to locate a viable rotation!"); assert((Lo || Hi) && "Failed to find a rotated input vector!"); if (!Lo) Lo = Hi; else if (!Hi) Hi = Lo; assert(VT.getSizeInBits() == 128 && "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); // The actual rotate instruction rotates bytes, so we need to scale the // rotation based on how many bytes are in the vector. int Scale = 16 / Mask.size(); // SSSE3 targets can use the palignr instruction if (Subtarget->hasSSSE3()) { // Cast the inputs to v16i8 to match PALIGNR. Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo); Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo, DAG.getConstant(Rotation * Scale, MVT::i8))); } // Default SSE2 implementation int LoByteShift = 16 - Rotation * Scale; int HiByteShift = Rotation * Scale; // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ. Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo); Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi); SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo, DAG.getConstant(8 * LoByteShift, MVT::i8)); SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi, DAG.getConstant(8 * HiByteShift, MVT::i8)); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); } /// \brief Compute whether each element of a shuffle is zeroable. /// /// A "zeroable" vector shuffle element is one which can be lowered to zero. /// Either it is an undef element in the shuffle mask, the element of the input /// referenced is undef, or the element of the input referenced is known to be /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle /// as many lanes with this technique as possible to simplify the remaining /// shuffle. static SmallBitVector computeZeroableShuffleElements(ArrayRef Mask, SDValue V1, SDValue V2) { SmallBitVector Zeroable(Mask.size(), false); bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; // Handle the easy cases. if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { Zeroable[i] = true; continue; } // If this is an index into a build_vector node, dig out the input value and // use it. SDValue V = M < Size ? V1 : V2; if (V.getOpcode() != ISD::BUILD_VECTOR) continue; SDValue Input = V.getOperand(M % Size); // The UNDEF opcode check really should be dead code here, but not quite // worth asserting on (it isn't invalid, just unexpected). if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) Zeroable[i] = true; } return Zeroable; } /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2 /// byte-shift instructions. The mask must consist of a shifted sequential /// shuffle from one of the input vectors and zeroable elements for the /// remaining 'shifted in' elements. /// /// Note that this only handles 128-bit vector widths currently. static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Size = Mask.size(); int Scale = 16 / Size; for (int Shift = 1; Shift < Size; Shift++) { int ByteShift = Shift * Scale; // PSRLDQ : (little-endian) right byte shift // [ 5, 6, 7, zz, zz, zz, zz, zz] // [ -1, 5, 6, 7, zz, zz, zz, zz] // [ 1, 2, -1, -1, -1, -1, zz, zz] bool ZeroableRight = true; for (int i = Size - Shift; i < Size; i++) { ZeroableRight &= Zeroable[i]; } if (ZeroableRight) { bool ValidShiftRight1 = isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift); bool ValidShiftRight2 = isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift); if (ValidShiftRight1 || ValidShiftRight2) { // Cast the inputs to v2i64 to match PSRLDQ. SDValue &TargetV = ValidShiftRight1 ? V1 : V2; SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV); SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V, DAG.getConstant(ByteShift * 8, MVT::i8)); return DAG.getNode(ISD::BITCAST, DL, VT, Shifted); } } // PSLLDQ : (little-endian) left byte shift // [ zz, 0, 1, 2, 3, 4, 5, 6] // [ zz, zz, -1, -1, 2, 3, 4, -1] // [ zz, zz, zz, zz, zz, zz, -1, 1] bool ZeroableLeft = true; for (int i = 0; i < Shift; i++) { ZeroableLeft &= Zeroable[i]; } if (ZeroableLeft) { bool ValidShiftLeft1 = isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0); bool ValidShiftLeft2 = isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size); if (ValidShiftLeft1 || ValidShiftLeft2) { // Cast the inputs to v2i64 to match PSLLDQ. SDValue &TargetV = ValidShiftLeft1 ? V1 : V2; SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV); SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V, DAG.getConstant(ByteShift * 8, MVT::i8)); return DAG.getNode(ISD::BITCAST, DL, VT, Shifted); } } } return SDValue(); } /// \brief Lower a vector shuffle as a zero or any extension. /// /// Given a specific number of elements, element bit width, and extension /// stride, produce either a zero or any extension based on the available /// features of the subtarget. static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); int EltBits = VT.getSizeInBits() / NumElements; assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Only 8, 16, and 32 bit elements can be extended."); assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. if (Subtarget->hasSSE41()) { MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV)); } // For any extends we can cheat for larger element sizes and use shuffle // instructions that can fold with a load and/or copy. if (AnyExt && EltBits == 32) { int PSHUFDMask[4] = {0, -1, 1, -1}; return DAG.getNode( ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); } if (AnyExt && EltBits == 16 && Scale > 2) { int PSHUFDMask[4] = {0, -1, 0, -1}; InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)); int PSHUFHWMask[4] = {1, -1, -1, -1}; return DAG.getNode( ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV), getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG))); } // If this would require more than 2 unpack instructions to expand, use // pshufb when available. We can only use more than 2 unpack instructions // when zero extending i8 elements which also makes it easier to use pshufb. if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) { assert(NumElements == 16 && "Unexpected byte vector width!"); SDValue PSHUFBMask[16]; for (int i = 0; i < 16; ++i) PSHUFBMask[i] = DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8); InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask))); } // Otherwise emit a sequence of unpacks. do { MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) : getZeroVector(InputVT, Subtarget, DAG, DL); InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV); InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext); Scale /= 2; EltBits *= 2; NumElements /= 2; } while (Scale > 1); return DAG.getNode(ISD::BITCAST, DL, VT, InputV); } /// \brief Try to lower a vector shuffle as a zero extension on any micrarch. /// /// This routine will try to do everything in its power to cleverly lower /// a shuffle which happens to match the pattern of a zero extend. It doesn't /// check for the profitability of this lowering, it tries to aggressively /// match this pattern. It will use all of the micro-architectural details it /// can to emit an efficient lowering. It handles both blends with all-zero /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to /// masking out later). /// /// The reason we have dedicated lowering for zext-style shuffles is that they /// are both incredibly common and often quite performance sensitive. static SDValue lowerVectorShuffleAsZeroOrAnyExtend( SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Bits = VT.getSizeInBits(); int NumElements = Mask.size(); // Define a helper function to check a particular ext-scale and lower to it if // valid. auto Lower = [&](int Scale) -> SDValue { SDValue InputV; bool AnyExt = true; for (int i = 0; i < NumElements; ++i) { if (Mask[i] == -1) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { // Each of the extend elements needs to be zeroable. if (!Zeroable[i]) return SDValue(); // We no lorger are in the anyext case. AnyExt = false; continue; } // Each of the base elements needs to be consecutive indices into the // same input vector. SDValue V = Mask[i] < NumElements ? V1 : V2; if (!InputV) InputV = V; else if (InputV != V) return SDValue(); // Flip-flopping inputs. if (Mask[i] % NumElements != i / Scale) return SDValue(); // Non-consecutive strided elemenst. } // If we fail to find an input, we have a zero-shuffle which should always // have already been handled. // FIXME: Maybe handle this here in case during blending we end up with one? if (!InputV) return SDValue(); return lowerVectorShuffleAsSpecificZeroOrAnyExtend( DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. assert(Bits % 64 == 0 && "The number of bits in a vector must be divisible by 64 on x86!"); int NumExtElements = Bits / 64; // Each iteration, try extending the elements half as much, but into twice as // many elements. for (; NumExtElements < NumElements; NumExtElements *= 2) { assert(NumElements % NumExtElements == 0 && "The input vector size must be divisble by the extended size."); if (SDValue V = Lower(NumElements / NumExtElements)) return V; } // No viable ext lowering found. return SDValue(); } /// \brief Try to get a scalar value for a specific element of a vector. /// /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG) { MVT VT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); while (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); // If the bitcasts shift the element size, we can't extract an equivalent // element from it. MVT NewVT = V.getSimpleValueType(); if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); if (V.getOpcode() == ISD::BUILD_VECTOR || (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx)); return SDValue(); } /// \brief Helper to test for a load that can be folded with x86 shuffles. /// /// This is particularly important because the set of instructions varies /// significantly based on whether the operand is a load or not. static bool isShuffleFoldableLoad(SDValue V) { while (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); return ISD::isNON_EXTLoad(V.getNode()); } /// \brief Try to lower insertion of a single element into a zero vector. /// /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. static SDValue lowerVectorShuffleAsElementInsertion( MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); int V2Index = std::find_if(Mask.begin(), Mask.end(), [&Mask](int M) { return M >= (int)Mask.size(); }) - Mask.begin(); bool IsV1Zeroable = true; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (i != V2Index && !Zeroable[i]) { IsV1Zeroable = false; break; } // Check for a single input from a SCALAR_TO_VECTOR node. // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. if (SDValue V2S = getScalarValueForVectorElement( V2, Mask[V2Index] - Mask.size(), DAG)) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S); if (EltVT == MVT::i8 || EltVT == MVT::i16) { // Using zext to expand a narrow element won't work for non-zero // insertions. if (!IsV1Zeroable) return SDValue(); // Zero-extend directly to i32. ExtVT = MVT::v4i32; V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); } V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || EltVT == MVT::i16) { // Either not inserting from the low element of the input or the input // element size is too small to use VZEXT_MOVL to clear the high bits. return SDValue(); } if (!IsV1Zeroable) { // If V1 can't be treated as a zero vector we have fewer options to lower // this. We can't support integer vectors or non-zero targets cheaply, and // the V1 elements can't be permuted in any way. assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); if (!VT.isFloatingPoint() || V2Index != 0) return SDValue(); SmallVector V1Mask(Mask.begin(), Mask.end()); V1Mask[V2Index] = -1; if (!isNoopShuffleMask(V1Mask)) return SDValue(); // This is essentially a special case blend operation, but if we have // general purpose blend operations, they are always faster. Bail and let // the rest of the lowering handle these as blends. if (Subtarget->hasSSE41()) return SDValue(); // Otherwise, use MOVSD or MOVSS. assert((EltVT == MVT::f32 || EltVT == MVT::f64) && "Only two types of floating point element types to handle!"); return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, ExtVT, V1, V2); } V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); if (ExtVT != VT) V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); if (V2Index != 0) { // If we have 4 or fewer lanes we can cheaply shuffle the element into // the desired position. Otherwise it is more efficient to do a vector // shift left. We know that we can do a vector shift left because all // the inputs are zero. if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { SmallVector V2Shuffle(Mask.size(), 1); V2Shuffle[V2Index] = 0; V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2); V2 = DAG.getNode( X86ISD::VSHLDQ, DL, MVT::v2i64, V2, DAG.getConstant( V2Index * EltVT.getSizeInBits(), DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64))); V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); } } return V2; } /// \brief Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { if (!Subtarget->hasAVX()) return SDValue(); if (VT.isInteger() && !Subtarget->hasAVX2()) return SDValue(); // Check that the mask is a broadcast. int BroadcastIdx = -1; for (int M : Mask) if (M >= 0 && BroadcastIdx == -1) BroadcastIdx = M; else if (M >= 0 && M != BroadcastIdx) return SDValue(); assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " "a sorted mask where the broadcast " "comes from V1."); // Go up the chain of (vector) values to try and find a scalar load that // we can combine with the broadcast. for (;;) { switch (V.getOpcode()) { case ISD::CONCAT_VECTORS: { int OperandSize = Mask.size() / V.getNumOperands(); V = V.getOperand(BroadcastIdx / OperandSize); BroadcastIdx %= OperandSize; continue; } case ISD::INSERT_SUBVECTOR: { SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); auto ConstantIdx = dyn_cast(V.getOperand(2)); if (!ConstantIdx) break; int BeginIdx = (int)ConstantIdx->getZExtValue(); int EndIdx = BeginIdx + (int)VInner.getValueType().getVectorNumElements(); if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { BroadcastIdx -= BeginIdx; V = VInner; } else { V = VOuter; } continue; } } break; } // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. if (V.getOpcode() == ISD::BUILD_VECTOR || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); // If the scalar isn't a load we can't broadcast from it in AVX1, only with // AVX2. if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) return SDValue(); } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { // We can't broadcast from a vector register w/o AVX2, and we can only // broadcast from the zero-element of a vector register. return SDValue(); } return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); } // Check for whether we can use INSERTPS to perform the shuffle. We only use // INSERTPS when the V1 elements are already in the correct locations // because otherwise we can just always use two SHUFPS instructions which // are much smaller to encode than a SHUFPS and an INSERTPS. We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); unsigned ZMask = 0; int V1DstIndex = -1; int V2DstIndex = -1; bool V1UsedInPlace = false; for (int i = 0; i < 4; i++) { // Synthesize a zero mask from the zeroable elements (includes undefs). if (Zeroable[i]) { ZMask |= 1 << i; continue; } // Flag if we use any V1 inputs in place. if (i == Mask[i]) { V1UsedInPlace = true; continue; } // We can only insert a single non-zeroable element. if (V1DstIndex != -1 || V2DstIndex != -1) return SDValue(); if (Mask[i] < 4) { // V1 input out of place for insertion. V1DstIndex = i; } else { // V2 input for insertion. V2DstIndex = i; } } // Don't bother if we have no (non-zeroable) element for insertion. if (V1DstIndex == -1 && V2DstIndex == -1) return SDValue(); // Determine element insertion src/dst indices. The src index is from the // start of the inserted vector, not the start of the concatenated vector. unsigned V2SrcIndex = 0; if (V1DstIndex != -1) { // If we have a V1 input out of place, we use V1 as the V2 element insertion // and don't use the original V2 at all. V2SrcIndex = Mask[V1DstIndex]; V2DstIndex = V1DstIndex; V2 = V1; } else { V2SrcIndex = Mask[V2DstIndex] - 4; } // If no V1 inputs are used in place, then the result is created only from // the zero mask and the V2 insertion - so remove V1 dependency. if (!V1UsedInPlace) V1 = DAG.getUNDEF(MVT::v4f32); unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); // Insert the V2 element into the desired position. SDLoc DL(Op); return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getConstant(InsertPSMask, MVT::i8)); } /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full /// support for floating point shuffles but not integer shuffles. These /// instructions will incur a domain crossing penalty on some chips though so /// it is better to avoid lowering through this for integer vectors where /// possible. static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (isSingleInputShuffleMask(Mask)) { // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); if (Subtarget->hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, DAG.getConstant(SHUFPDMask, MVT::i8)); } return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1, DAG.getConstant(SHUFPDMask, MVT::i8)); } assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); assert(Mask[1] >= 2 && "Non-canonicalized blend!"); // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 2)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2); if (isShuffleEquivalent(Mask, 1, 3)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2); // If we have a single input, insert that into V1 if we can do so cheaply. if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG)) return Insertion; } // Try to use one of the special instruction patterns to handle two common // blend patterns if a zero-blend above didn't work. if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3)) if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) // We can either use a special instruction to load over the low double or // to move just the low double. return DAG.getNode( isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD, DL, MVT::v2f64, V2, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); if (Subtarget->hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) return Blend; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2, DAG.getConstant(SHUFPDMask, MVT::i8)); } /// \brief Handle lowering of 2-lane 64-bit integer shuffles. /// /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by /// the integer unit to minimize domain crossing penalties. However, for blends /// it falls back to the floating point shuffle operation with appropriate bit /// casting. static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (isSingleInputShuffleMask(Mask)) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1); int WidenedMask[4] = { std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; return DAG.getNode( ISD::BITCAST, DL, MVT::v2i64, DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); } // Try to use byte shift instructions. if (SDValue Shift = lowerVectorShuffleAsByteShift( DL, MVT::v2i64, V1, V2, Mask, DAG)) return Shift; // If we have a single input from V2 insert that into V1 if we can do so // cheaply. if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG)) return Insertion; } // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 2)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); if (isShuffleEquivalent(Mask, 1, 3)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); if (Subtarget->hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Blend; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget->hasSSSE3()) if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't // have this problem. It would be really nice if x86 had better shuffles here. V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1); V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2); return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } /// \brief Lower a vector shuffle using the SHUFPS instruction. /// /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. /// It makes no assumptions about whether this is the *best* lowering, it simply /// uses it. static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); if (NumV2Elements == 1) { int V2Index = std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - Mask.begin(); // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; if (Mask[V2AdjIndex] == -1) { // Handles all the cases where we have a single V2 element and an undef. // This will only ever happen in the high lanes because we commute the // vector otherwise. if (V2Index < 2) std::swap(LowV, HighV); NewMask[V2Index] -= 4; } else { // Handle the case where the V2 element ends up adjacent to a V1 element. // To make this work, blend them together as the first step. int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, getV4X86ShuffleImm8ForMask(BlendMask, DAG)); // Now proceed to reconstruct the final blend as we have the necessary // high or low half formed. if (V2Index < 2) { LowV = V2; HighV = V1; } else { HighV = V2; } NewMask[V1Index] = 2; // We put the V1 element in V2[2]. NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. } } else if (NumV2Elements == 2) { if (Mask[0] < 4 && Mask[1] < 4) { // Handle the easy case where we have V1 in the low lanes and V2 in the // high lanes. NewMask[2] -= 4; NewMask[3] -= 4; } else if (Mask[2] < 4 && Mask[3] < 4) { // We also handle the reversed case because this utility may get called // when we detect a SHUFPS pattern but can't easily commute the shuffle to // arrange things in the right direction. NewMask[0] -= 4; NewMask[1] -= 4; HighV = V1; LowV = V2; } else { // We have a mixture of V1 and V2 in both low and high lanes. Rather than // trying to place elements directly, just blend them and set up the final // shuffle to place them. // The first two blend mask elements are for V1, the second two are for // V2. int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], Mask[2] < 4 ? Mask[2] : Mask[3], (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, getV4X86ShuffleImm8ForMask(BlendMask, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to // a blend. LowV = HighV = V1; NewMask[0] = Mask[0] < 4 ? 0 : 2; NewMask[1] = Mask[0] < 4 ? 2 : 0; NewMask[2] = Mask[2] < 4 ? 1 : 3; NewMask[3] = Mask[2] < 4 ? 3 : 1; } } return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DAG)); } /// \brief Lower 4-lane 32-bit floating point shuffles. /// /// Uses instructions exclusively from the floating point unit to minimize /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1, Mask, Subtarget, DAG)) return Broadcast; if (Subtarget->hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); } // Otherwise, use a straight shuffle of a single input vector. We pass the // input vector to both operands to simulate this with a SHUFPS. return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); } // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that // we defer to if both this and BLENDPS fail to match, so restrict this to // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2, Mask, Subtarget, DAG)) return V; if (Subtarget->hasSSE41()) { if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return Blend; // Use INSERTPS if we can complete the shuffle efficiently. if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG)) return V; } // Otherwise fall back to a SHUFPS lowering strategy. return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); } /// \brief Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return ZExt; int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We coerce the shuffle pattern to be compatible with UNPCK instructions // but we aren't actually going to use the UNPCK instruction because doing // so prevents folding a load into this instruction or making a copy. const int UnpackLoMask[] = {0, 0, 1, 1}; const int UnpackHiMask[] = {2, 2, 3, 3}; if (isShuffleEquivalent(Mask, 0, 0, 1, 1)) Mask = UnpackLoMask; else if (isShuffleEquivalent(Mask, 2, 2, 3, 3)) Mask = UnpackHiMask; return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); } // Try to use byte shift instructions. if (SDValue Shift = lowerVectorShuffleAsByteShift( DL, MVT::v4i32, V1, V2, Mask, DAG)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2, Mask, Subtarget, DAG)) return V; // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); if (Subtarget->hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Blend; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget->hasSSSE3()) if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would encur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, DAG.getVectorShuffle( MVT::v4f32, DL, DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1), DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask)); } /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 /// shuffle lowering, and the most complex part. /// /// The lowering strategy is to try to form pairs of input lanes which are /// targeted at the same half of the final vector, and then use a dword shuffle /// to place them onto the right half, and finally unpack the paired lanes into /// their final position. /// /// The exact breakdown of how to form these dword pairs and align them on the /// correct sides is really tricky. See the comments within the function for /// more of the details. static SDValue lowerV8I16SingleInputVectorShuffle( SDLoc DL, SDValue V, MutableArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); MutableArrayRef LoMask = Mask.slice(0, 4); MutableArrayRef HiMask = Mask.slice(4, 4); SmallVector LoInputs; std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs), [](int M) { return M >= 0; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector HiInputs; std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs), [](int M) { return M >= 0; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); int NumLToL = std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); int NumHToL = LoInputs.size() - NumLToL; int NumLToH = std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); int NumHToH = HiInputs.size() - NumLToH; MutableArrayRef LToLInputs(LoInputs.data(), NumLToL); MutableArrayRef LToHInputs(HiInputs.data(), NumLToH); MutableArrayRef HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef HToHInputs(HiInputs.data() + NumLToH, NumHToH); // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V, Mask, Subtarget, DAG)) return Broadcast; // Try to use byte shift instructions. if (SDValue Shift = lowerVectorShuffleAsByteShift( DL, MVT::v8i16, V, V, Mask, DAG)) return Shift; // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V); if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V); // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i16, V, V, Mask, Subtarget, DAG)) return Rotate; // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through // to the generic code below. For example: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] // // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half // and an existing 2-into-2 on the other half. In this case we may have to // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. // Fortunately, we don't have to handle anything but a 2-into-2 pattern // because any other situation (including a 3-into-1 or 1-into-3 in the other // half than the one we target for fixing) will be fixed when we re-enter this // path. We will also combine away any sequence of PSHUFD instructions that // result into a single instruction. Here is an example of the tricky case: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] // // This now has a 1-into-3 in the high half! Instead, we do two shuffles: // // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] // // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] // // The result is fine to be handled by the generic logic. auto balanceSides = [&](ArrayRef AToAInputs, ArrayRef BToAInputs, ArrayRef BToBInputs, ArrayRef AToBInputs, int AOffset, int BOffset) { assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && "Must call this with A having 3 or 1 inputs from the A half."); assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && "Must call this with B having 1 or 3 inputs from the B half."); assert(AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. int ADWord, BDWord; int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord; int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord; int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset; ArrayRef TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs; int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0]; int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); int TripleNonInputIdx = TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); TripleDWord = TripleNonInputIdx / 2; // We use xor with one to compute the adjacent DWord to whichever one the // OneInput is in. OneInputDWord = (OneInput / 2) ^ 1; // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA // and BToA inputs. If there is also such a problem with the BToB and AToB // inputs, we don't try to fix it necessarily -- we'll recurse and see it in // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it // is essential that we don't *create* a 3<-1 as then we might oscillate. if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { // Compute how many inputs will be flipped by swapping these DWords. We // need // to balance this to ensure we don't form a 3-1 shuffle in the other // half. int NumFlippedAToBInputs = std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); int NumFlippedBToBInputs = std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); if ((NumFlippedAToBInputs == 1 && (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || (NumFlippedBToBInputs == 1 && (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { // We choose whether to fix the A half or B half based on whether that // half has zero flipped inputs. At zero, we may not be able to fix it // with that half. We also bias towards fixing the B half because that // will more commonly be the high half, and we have to bias one way. auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, ArrayRef Inputs) { int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(), PinnedIdx ^ 1) != Inputs.end(); // Determine whether the free index is in the flipped dword or the // unflipped dword based on where the pinned index is. We use this bit // in an xor to conditionally select the adjacent dword. int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), FixFreeIdx) != Inputs.end(); if (IsFixIdxInput == IsFixFreeIdxInput) FixFreeIdx += 1; IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), FixFreeIdx) != Inputs.end(); assert(IsFixIdxInput != IsFixFreeIdxInput && "We need to be changing the number of flipped inputs!"); int PSHUFHalfMask[] = {0, 1, 2, 3}; std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, MVT::v8i16, V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG)); for (int &M : Mask) if (M != -1 && M == FixIdx) M = FixFreeIdx; else if (M != -1 && M == FixFreeIdx) M = FixIdx; }; if (NumFlippedBToBInputs != 0) { int BPinnedIdx = BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); } else { assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); int APinnedIdx = AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); } } } int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[ADWord] = BDWord; PSHUFDMask[BDWord] = ADWord; V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); // Adjust the mask to match the new locations of A and B. for (int &M : Mask) if (M != -1 && M/2 == ADWord) M = 2 * BDWord + M % 2; else if (M != -1 && M/2 == BDWord) M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), Mask); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from // each half. That means the inputs can always be grouped into dwords and // those dwords can then be moved to the correct half with a dword shuffle. // We use at most one low and one high word shuffle to collect these paired // inputs into dwords, and finally a dword shuffle to place them. int PSHUFLMask[4] = {-1, -1, -1, -1}; int PSHUFHMask[4] = {-1, -1, -1, -1}; int PSHUFDMask[4] = {-1, -1, -1, -1}; // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. auto fixInPlaceInputs = [&PSHUFDMask](ArrayRef InPlaceInputs, ArrayRef IncomingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } if (IncomingInputs.empty()) { // Just fix all of the in place inputs. for (int Input : InPlaceInputs) { SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; PSHUFDMask[Input / 2] = Input / 2; } return; } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; // Put the second input next to the first so that they are packed into // a dword. We find the adjacent index by toggling the low bit. int AdjIndex = InPlaceInputs[0] ^ 1; SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. // FIXME: This operation could almost certainly be simplified dramatically to // look more like the 3-1 fixing operation. auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef IncomingInputs, ArrayRef ExistingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, MutableArrayRef FinalSourceHalfMask, int SourceOffset, int DestOffset) { auto isWordClobbered = [](ArrayRef SourceHalfMask, int Word) { return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; }; auto isDWordClobbered = [&isWordClobbered](ArrayRef SourceHalfMask, int Word) { int LowWord = Word & ~1; int HighWord = Word | 1; return isWordClobbered(SourceHalfMask, LowWord) || isWordClobbered(SourceHalfMask, HighWord); }; if (IncomingInputs.empty()) return; if (ExistingInputs.empty()) { // Map any dwords with inputs from them into the right half. for (int Input : IncomingInputs) { // If the source half mask maps over the inputs, turn those into // swaps and use the swapped lane. if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) { SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = Input - SourceOffset; // We have to swap the uses in our half mask in one sweep. for (int &M : HalfMask) if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) M = Input; else if (M == Input) M = SourceHalfMask[Input - SourceOffset] + SourceOffset; } else { assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && "Previous placement doesn't match!"); } // Note that this correctly re-maps both when we do a swap and when // we observe the other side of the swap above. We rely on that to // avoid swapping the members of the input list directly. Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; } // Map the input's dword into the correct half. if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1) PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; else assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!"); } // And just directly shift any other-half mask elements to be same-half // as we will have mirrored the dword containing the element into the // same position within that half. for (int &M : HalfMask) if (M >= SourceOffset && M < SourceOffset + 4) { M = M - SourceOffset + DestOffset; assert(M >= 0 && "This should never wrap below zero!"); } return; } // Ensure we have the input in a viable dword of its current half. This // is particularly tricky because the original position may be clobbered // by inputs being moved and *staying* in that half. if (IncomingInputs.size() == 1) { if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { int InputFixed = std::find(std::begin(SourceHalfMask), std::end(SourceHalfMask), -1) - std::begin(SourceHalfMask) + SourceOffset; SourceHalfMask[InputFixed - SourceOffset] = IncomingInputs[0] - SourceOffset; std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], InputFixed); IncomingInputs[0] = InputFixed; } } else if (IncomingInputs.size() == 2) { if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { // We have two non-adjacent or clobbered inputs we need to extract from // the source half. To do this, we need to map them into some adjacent // dword slot in the source mask. int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, IncomingInputs[1] - SourceOffset}; // If there is a free slot in the source half mask adjacent to one of // the inputs, place the other input in it. We use (Index XOR 1) to // compute an adjacent index. if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && SourceHalfMask[InputsFixed[0] ^ 1] == -1) { SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; InputsFixed[1] = InputsFixed[0] ^ 1; } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && SourceHalfMask[InputsFixed[1] ^ 1] == -1) { SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; InputsFixed[0] = InputsFixed[1] ^ 1; } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 && SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) { // The two inputs are in the same DWord but it is clobbered and the // adjacent DWord isn't used at all. Move both inputs to the free // slot. SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; } else { // The only way we hit this point is if there is no clobbering // (because there are no off-half inputs to this half) and there is no // free slot adjacent to one of the inputs. In this case, we have to // swap an input with a non-input. for (int i = 0; i < 4; ++i) assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) && "We can't handle any clobbers here!"); assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && "Cannot have adjacent inputs here!"); SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; // We also have to update the final source mask in this case because // it may need to undo the above swap. for (int &M : FinalSourceHalfMask) if (M == (InputsFixed[0] ^ 1) + SourceOffset) M = InputsFixed[1] + SourceOffset; else if (M == InputsFixed[1] + SourceOffset) M = (InputsFixed[0] ^ 1) + SourceOffset; InputsFixed[1] = InputsFixed[0] ^ 1; } // Point everything at the fixed inputs. for (int &M : HalfMask) if (M == IncomingInputs[0]) M = InputsFixed[0] + SourceOffset; else if (M == IncomingInputs[1]) M = InputsFixed[1] + SourceOffset; IncomingInputs[0] = InputsFixed[0] + SourceOffset; IncomingInputs[1] = InputsFixed[1] + SourceOffset; } } else { llvm_unreachable("Unhandled input size!"); } // Now hoist the DWord down to the right half. int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2; assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; for (int &M : HalfMask) for (int Input : IncomingInputs) if (M == Input) M = FreeDWord * 2 + Input % 2; }; moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, /*SourceOffset*/ 4, /*DestOffset*/ 0); moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, /*SourceOffset*/ 0, /*DestOffset*/ 4); // Now enact all the shuffles we've computed to move the inputs into their // target half. if (!isNoopShuffleMask(PSHUFLMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG)); if (!isNoopShuffleMask(PSHUFHMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG)); if (!isNoopShuffleMask(PSHUFDMask)) V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); // At this point, each half should contain all its inputs, and we can then // just shuffle them into their final position. assert(std::count_if(LoMask.begin(), LoMask.end(), [](int M) { return M >= 4; }) == 0 && "Failed to lift all the high half inputs to the low mask!"); assert(std::count_if(HiMask.begin(), HiMask.end(), [](int M) { return M >= 0 && M < 4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"); // Do a half shuffle for the low mask. if (!isNoopShuffleMask(LoMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, getV4X86ShuffleImm8ForMask(LoMask, DAG)); // Do a half shuffle with the high mask after shifting its values down. for (int &M : HiMask) if (M >= 0) M -= 4; if (!isNoopShuffleMask(HiMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, getV4X86ShuffleImm8ForMask(HiMask, DAG)); return V; } /// \brief Detect whether the mask pattern should be lowered through /// interleaving. /// /// This essentially tests whether viewing the mask as an interleaving of two /// sub-sequences reduces the cross-input traffic of a blend operation. If so, /// lowering it through interleaving is a significantly better strategy. static bool shouldLowerAsInterleaving(ArrayRef Mask) { int NumEvenInputs[2] = {0, 0}; int NumOddInputs[2] = {0, 0}; int NumLoInputs[2] = {0, 0}; int NumHiInputs[2] = {0, 0}; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] < 0) continue; int InputIdx = Mask[i] >= Size; if (i < Size / 2) ++NumLoInputs[InputIdx]; else ++NumHiInputs[InputIdx]; if ((i % 2) == 0) ++NumEvenInputs[InputIdx]; else ++NumOddInputs[InputIdx]; } // The minimum number of cross-input results for both the interleaved and // split cases. If interleaving results in fewer cross-input results, return // true. int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0], NumEvenInputs[0] + NumOddInputs[1]); int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0], NumLoInputs[0] + NumHiInputs[1]); return InterleavedCrosses < SplitCrosses; } /// \brief Blend two v8i16 vectors using a naive unpack strategy. /// /// This strategy only works when the inputs from each vector fit into a single /// half of that vector, and generally there are not so many inputs as to leave /// the in-place shuffles required highly constrained (and thus expensive). It /// shifts all the inputs into a single side of both input vectors and then /// uses an unpack to interleave these inputs in a single vector. At that /// point, we will fall back on the generic single input shuffle lowering. static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1, SDValue V2, MutableArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); SmallVector LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs; for (int i = 0; i < 8; ++i) if (Mask[i] >= 0 && Mask[i] < 4) LoV1Inputs.push_back(i); else if (Mask[i] >= 4 && Mask[i] < 8) HiV1Inputs.push_back(i); else if (Mask[i] >= 8 && Mask[i] < 12) LoV2Inputs.push_back(i); else if (Mask[i] >= 12) HiV2Inputs.push_back(i); int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size(); int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size(); (void)NumV1Inputs; (void)NumV2Inputs; assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported"); assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported"); assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs"); bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >= HiV1Inputs.size() + HiV2Inputs.size(); auto moveInputsToHalf = [&](SDValue V, ArrayRef LoInputs, ArrayRef HiInputs, bool MoveToLo, int MaskOffset) { ArrayRef GoodInputs = MoveToLo ? LoInputs : HiInputs; ArrayRef BadInputs = MoveToLo ? HiInputs : LoInputs; if (BadInputs.empty()) return V; int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1}; int MoveOffset = MoveToLo ? 0 : 4; if (GoodInputs.empty()) { for (int BadInput : BadInputs) { MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset; Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset; } } else { if (GoodInputs.size() == 2) { // If the low inputs are spread across two dwords, pack them into // a single dword. MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset; MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset; Mask[GoodInputs[0]] = MoveOffset + MaskOffset; Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset; } else { // Otherwise pin the good inputs. for (int GoodInput : GoodInputs) MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset; } if (BadInputs.size() == 2) { // If we have two bad inputs then there may be either one or two good // inputs fixed in place. Find a fixed input, and then find the *other* // two adjacent indices by using modular arithmetic. int GoodMaskIdx = std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), [](int M) { return M >= 0; }) - std::begin(MoveMask); int MoveMaskIdx = ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset; assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot"); assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot"); MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset; Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset; } else { assert(BadInputs.size() == 1 && "All sizes handled"); int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) - std::begin(MoveMask); MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; } } return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), MoveMask); }; V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo, /*MaskOffset*/ 0); V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo, /*MaskOffset*/ 8); // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes // cross-half traffic in the final shuffle. // Munge the mask to be a single-input mask after the unpack merges the // results. for (int &M : Mask) if (M != -1) M = 2 * (M % 4) + (M / 8); return DAG.getVectorShuffle( MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2), DAG.getUNDEF(MVT::v8i16), Mask); } /// \brief Generic lowering of 8-lane i16 shuffles. /// /// This handles both single-input shuffles and combined shuffle/blends with /// two inputs. The single input shuffles are immediately delegated to /// a dedicated lowering routine. /// /// The blends are lowered in one of three fundamental ways. If there are few /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle /// of the input is significantly cheaper when lowered as an interleaving of /// the two inputs, try to interleave them. Otherwise, blend the low and high /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef OrigMask = SVOp->getMask(); int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]}; MutableArrayRef Mask(MaskStorage); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG)) return ZExt; auto isV1 = [](int M) { return M >= 0 && M < 8; }; auto isV2 = [](int M) { return M >= 8; }; int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1); int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2); if (NumV2Inputs == 0) return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG); assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized " "to be V1-input shuffles."); // Try to use byte shift instructions. if (SDValue Shift = lowerVectorShuffleAsByteShift( DL, MVT::v8i16, V1, V2, Mask, DAG)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2, Mask, Subtarget, DAG)) return V; // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2); if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2); if (Subtarget->hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Blend; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (NumV1Inputs + NumV2Inputs <= 4) return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG); // Check whether an interleaving lowering is likely to be more efficient. // This isn't perfect but it is a strong heuristic that tends to work well on // the kinds of shuffles that show up in practice. // // FIXME: Handle 1x, 2x, and 4x interleaving. if (shouldLowerAsInterleaving(Mask)) { // FIXME: Figure out whether we should pack these into the low or high // halves. int EMask[8], OMask[8]; for (int i = 0; i < 4; ++i) { EMask[i] = Mask[2*i]; OMask[i] = Mask[2*i + 1]; EMask[i + 4] = -1; OMask[i + 4] = -1; } SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask); SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask); return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds); } int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 4; ++i) { LoBlendMask[i] = Mask[i]; HiBlendMask[i] = Mask[i + 4]; } SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV); HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV); return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV)); } /// \brief Check whether a compaction lowering can be done by dropping even /// elements and compute how many times even elements must be dropped. /// /// This handles shuffles which take every Nth element where N is a power of /// two. Example shuffle masks: /// /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 /// /// Any of these lanes can of course be undef. /// /// This routine only supports N <= 3. /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here /// for larger N. /// /// \returns N above, or the number of times even elements must be dropped if /// there is such a number. Otherwise returns zero. static int canLowerByDroppingEvenElements(ArrayRef Mask) { // Figure out whether we're looping over two inputs or just one. bool IsSingleInput = isSingleInputShuffleMask(Mask); // The modulus for the shuffle vector entries is based on whether this is // a single input or not. int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); assert(isPowerOf2_32((uint32_t)ShuffleModulus) && "We should only be called with masks with a power-of-2 size!"); uint64_t ModMask = (uint64_t)ShuffleModulus - 1; // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, // and 2^3 simultaneously. This is because we may have ambiguity with // partially undef inputs. bool ViableForN[3] = {true, true, true}; for (int i = 0, e = Mask.size(); i < e; ++i) { // Ignore undef lanes, we'll optimistically collapse them to the pattern we // want. if (Mask[i] == -1) continue; bool IsAnyViable = false; for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) if (ViableForN[j]) { uint64_t N = j + 1; // The shuffle mask must be equal to (i * 2^N) % M. if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) IsAnyViable = true; else ViableForN[j] = false; } // Early exit if we exhaust the possible powers of two. if (!IsAnyViable) break; } for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) if (ViableForN[j]) return j + 1; // Return 0 as there is no viable power of two. return 0; } /// \brief Generic lowering of v16i8 shuffles. /// /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to /// detect any complexity reducing interleaving. If that doesn't help, it uses /// UNPCK to spread the i8 elements across two i16-element vectors, and uses /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef OrigMask = SVOp->getMask(); assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Try to use byte shift instructions. if (SDValue Shift = lowerVectorShuffleAsByteShift( DL, MVT::v16i8, V1, V2, OrigMask, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG)) return Rotate; // Try to use a zext lowering. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG)) return ZExt; int MaskStorage[16] = { OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7], OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11], OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]}; MutableArrayRef Mask(MaskStorage); MutableArrayRef LoMask = Mask.slice(0, 8); MutableArrayRef HiMask = Mask.slice(8, 8); int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); // For single-input shuffles, there are some nicer lowering tricks we can use. if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1, Mask, Subtarget, DAG)) return Broadcast; // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. // However, it only makes sense if the pre-duplication shuffle simplifies // things significantly. Currently, this means we need to be able to // express the pre-duplication shuffle as an i16 shuffle. // // FIXME: We should check for other patterns which can be widened into an // i16 shuffle as well. auto canWidenViaDuplication = [](ArrayRef Mask) { for (int i = 0; i < 16; i += 2) if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1]) return false; return true; }; auto tryToWidenViaDuplication = [&]() -> SDValue { if (!canWidenViaDuplication(Mask)) return SDValue(); SmallVector LoInputs; std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), [](int M) { return M >= 0 && M < 8; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector HiInputs; std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), [](int M) { return M >= 8; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); bool TargetLo = LoInputs.size() >= HiInputs.size(); ArrayRef InPlaceInputs = TargetLo ? LoInputs : HiInputs; ArrayRef MovingInputs = TargetLo ? HiInputs : LoInputs; int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; SmallDenseMap LaneMap; for (int I : InPlaceInputs) { PreDupI16Shuffle[I/2] = I/2; LaneMap[I] = I; } int j = TargetLo ? 0 : 4, je = j + 4; for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { // Check if j is already a shuffle of this input. This happens when // there are two adjacent bytes after we move the low one. if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { // If we haven't yet mapped the input, search for a slot into which // we can map it. while (j < je && PreDupI16Shuffle[j] != -1) ++j; if (j == je) // We can't place the inputs into a single half with a simple i16 shuffle, so bail. return SDValue(); // Map this input with the i16 shuffle. PreDupI16Shuffle[j] = MovingInputs[i] / 2; } // Update the lane map based on the mapping we ended up with. LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; } V1 = DAG.getNode( ISD::BITCAST, DL, MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); // Unpack the bytes to form the i16s that will be shuffled into place. V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, MVT::v16i8, V1, V1); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 16; ++i) if (Mask[i] != -1) { int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); if (PostDupI16Shuffle[i / 2] == -1) PostDupI16Shuffle[i / 2] = MappedMask; else assert(PostDupI16Shuffle[i / 2] == MappedMask && "Conflicting entrties in the original shuffle!"); } return DAG.getNode( ISD::BITCAST, DL, MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); }; if (SDValue V = tryToWidenViaDuplication()) return V; } // Check whether an interleaving lowering is likely to be more efficient. // This isn't perfect but it is a strong heuristic that tends to work well on // the kinds of shuffles that show up in practice. // // FIXME: We need to handle other interleaving widths (i16, i32, ...). if (shouldLowerAsInterleaving(Mask)) { int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) { return (M >= 0 && M < 8) || (M >= 16 && M < 24); }); int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) { return (M >= 8 && M < 16) || M >= 24; }); int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; bool UnpackLo = NumLoHalf >= NumHiHalf; MutableArrayRef TargetEMask(UnpackLo ? EMask : EMask + 8, 8); MutableArrayRef TargetOMask(UnpackLo ? OMask : OMask + 8, 8); for (int i = 0; i < 8; ++i) { TargetEMask[i] = Mask[2 * i]; TargetOMask[i] = Mask[2 * i + 1]; } SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask); SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask); return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, MVT::v16i8, Evens, Odds); } // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any // blends but after all of the single-input lowerings. If the single input // lowerings can find an instruction sequence that is faster than a PSHUFB, we // want to preserve that and we can DAG combine any longer sequences into // a PSHUFB in the end. But once we start blending from multiple inputs, // the complexity of DAG combining bad patterns back into PSHUFB is too high, // and there are *very* few patterns that would actually be faster than the // PSHUFB approach because of its ability to zero lanes. // // FIXME: The only exceptions to the above are blends which are exact // interleavings with direct instructions supporting them. We currently don't // handle those well here. if (Subtarget->hasSSSE3()) { SDValue V1Mask[16]; SDValue V2Mask[16]; bool V1InUse = false; bool V2InUse = false; SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); for (int i = 0; i < 16; ++i) { if (Mask[i] == -1) { V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); } else { const int ZeroMask = 0x80; int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask); int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16); if (Zeroable[i]) V1Idx = V2Idx = ZeroMask; V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8); V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8); V1InUse |= (ZeroMask != V1Idx); V2InUse |= (ZeroMask != V2Idx); } } if (V1InUse) V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); if (V2InUse) V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); // If we need shuffled inputs from both, blend the two. if (V1InUse && V2InUse) return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); if (V1InUse) return V1; // Single inputs are easy. if (V2InUse) return V2; // Single inputs are easy. // Shuffling to a zeroable vector. return getZeroVector(MVT::v16i8, Subtarget, DAG, DL); } // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2, Mask, Subtarget, DAG)) return V; // Check whether a compaction lowering can be done. This handles shuffles // which take every Nth element for some even N. See the helper function for // details. // // We special case these as they can be particularly efficiently handled with // the PACKUSB instruction on x86 and they show up in common patterns of // rearranging bytes to truncate wide elements. if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) { // NumEvenDrops is the power of two stride of the elements. Another way of // thinking about it is that we need to drop the even elements this many // times to get the original input. bool IsSingleInput = isSingleInputShuffleMask(Mask); // First we need to zero all the dropped bytes. assert(NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."); // We use the mask type to pick which bytes are preserved based on how many // elements are dropped. MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; SDValue ByteClearMask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1])); V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); if (!IsSingleInput) V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); // Now pack things back together. V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); for (int i = 1; i < NumEvenDrops; ++i) { Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result); Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); } return Result; } int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; auto buildBlendMasks = [](MutableArrayRef HalfMask, MutableArrayRef V1HalfBlendMask, MutableArrayRef V2HalfBlendMask) { for (int i = 0; i < 8; ++i) if (HalfMask[i] >= 0 && HalfMask[i] < 16) { V1HalfBlendMask[i] = HalfMask[i]; HalfMask[i] = i; } else if (HalfMask[i] >= 16) { V2HalfBlendMask[i] = HalfMask[i] - 16; HalfMask[i] = i + 8; } }; buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask); buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask); SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef LoBlendMask, MutableArrayRef HiBlendMask) { SDValue V1, V2; // Check if any of the odd lanes in the v16i8 are used. If not, we can mask // them out and avoid using UNPCK{L,H} to extract the elements of V as // i16s. if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(), [](int M) { return M >= 0 && M % 2 == 1; }) && std::none_of(HiBlendMask.begin(), HiBlendMask.end(), [](int M) { return M >= 0 && M % 2 == 1; })) { // Use a mask to drop the high bytes. V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1, DAG.getConstant(0x00FF, MVT::v8i16)); // This will be a single vector shuffle instead of a blend so nuke V2. V2 = DAG.getUNDEF(MVT::v8i16); // Squash the masks to point directly into V1. for (int &M : LoBlendMask) if (M >= 0) M /= 2; for (int &M : HiBlendMask) if (M >= 0) M /= 2; } else { // Otherwise just unpack the low half of V into V1 and the high half into // V2 so that we can blend them as i16s. V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); } SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); return std::make_pair(BlendedLo, BlendedHi); }; SDValue V1Lo, V1Hi, V2Lo, V2Hi; std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask); std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask); SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask); SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask); return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); } /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles. /// /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG) { switch (VT.SimpleTy) { case MVT::v2i64: return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v2f64: return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v4i32: return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v4f32: return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v8i16: return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v16i8: return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG); default: llvm_unreachable("Unimplemented!"); } } /// \brief Helper function to test whether a shuffle mask could be /// simplified by widening the elements being shuffled. /// /// Appends the mask for wider elements in WidenedMask if valid. Otherwise /// leaves it in an unspecified state. /// /// NOTE: This must handle normal vector shuffle masks and *target* vector /// shuffle masks. The latter have the special property of a '-2' representing /// a zero-ed lane of a vector. static bool canWidenShuffleElements(ArrayRef Mask, SmallVectorImpl &WidenedMask) { for (int i = 0, Size = Mask.size(); i < Size; i += 2) { // If both elements are undef, its trivial. if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { WidenedMask.push_back(SM_SentinelUndef); continue; } // Check for an undef mask and a mask value properly aligned to fit with // a pair of values. If we find such a case, use the non-undef mask's value. if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) { WidenedMask.push_back(Mask[i + 1] / 2); continue; } if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { WidenedMask.push_back(Mask[i] / 2); continue; } // When zeroing, we need to spread the zeroing across both lanes to widen. if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { WidenedMask.push_back(SM_SentinelZero); continue; } return false; } // Finally check if the two mask values are adjacent and aligned with // a pair. if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) { WidenedMask.push_back(Mask[i] / 2); continue; } // Otherwise we can't safely widen the elements used in this shuffle. return false; } assert(WidenedMask.size() == Mask.size() / 2 && "Incorrect size of mask after widening the elements!"); return true; } /// \brief Generic routine to split ector shuffle into half-sized shuffles. /// /// This routine just extracts two subvectors, shuffles them independently, and /// then concatenates them back together. This should work effectively with all /// AVX vector shuffle types. static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.getSizeInBits() >= 256 && "Only for 256-bit or wider vector shuffles!"); assert(V1.getSimpleValueType() == VT && "Bad operand type!"); assert(V2.getSimpleValueType() == VT && "Bad operand type!"); ArrayRef LoMask = Mask.slice(0, Mask.size() / 2); ArrayRef HiMask = Mask.slice(Mask.size() / 2); int NumElements = VT.getVectorNumElements(); int SplitNumElements = NumElements / 2; MVT ScalarVT = VT.getScalarType(); MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1, DAG.getIntPtrConstant(0)); SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1, DAG.getIntPtrConstant(SplitNumElements)); SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2, DAG.getIntPtrConstant(0)); SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2, DAG.getIntPtrConstant(SplitNumElements)); // Now create two 4-way blends of these half-width vectors. auto HalfBlend = [&](ArrayRef HalfMask) { bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; SmallVector V1BlendMask, V2BlendMask, BlendMask; for (int i = 0; i < SplitNumElements; ++i) { int M = HalfMask[i]; if (M >= NumElements) { if (M >= NumElements + SplitNumElements) UseHiV2 = true; else UseLoV2 = true; V2BlendMask.push_back(M - NumElements); V1BlendMask.push_back(-1); BlendMask.push_back(SplitNumElements + i); } else if (M >= 0) { if (M >= SplitNumElements) UseHiV1 = true; else UseLoV1 = true; V2BlendMask.push_back(-1); V1BlendMask.push_back(M); BlendMask.push_back(i); } else { V2BlendMask.push_back(-1); V1BlendMask.push_back(-1); BlendMask.push_back(-1); } } // Because the lowering happens after all combining takes place, we need to // manually combine these blend masks as much as possible so that we create // a minimal number of high-level vector shuffle nodes. // First try just blending the halves of V1 or V2. if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) return DAG.getUNDEF(SplitVT); if (!UseLoV2 && !UseHiV2) return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); if (!UseLoV1 && !UseHiV1) return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); SDValue V1Blend, V2Blend; if (UseLoV1 && UseHiV1) { V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); } else { // We only use half of V1 so map the usage down into the final blend mask. V1Blend = UseLoV1 ? LoV1 : HiV1; for (int i = 0; i < SplitNumElements; ++i) if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); } if (UseLoV2 && UseHiV2) { V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); } else { // We only use half of V2 so map the usage down into the final blend mask. V2Blend = UseLoV2 ? LoV2 : HiV2; for (int i = 0; i < SplitNumElements; ++i) if (BlendMask[i] >= SplitNumElements) BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); } return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); }; SDValue Lo = HalfBlend(LoMask); SDValue Hi = HalfBlend(HiMask); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } /// \brief Either split a vector in halves or decompose the shuffles and the /// blend. /// /// This is provided as a good fallback for many lowerings of non-single-input /// shuffles with more than one 128-bit lane. In those cases, we want to select /// between splitting the shuffle into 128-bit components and stitching those /// back together vs. extracting the single-input shuffles and blending those /// results. static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to " "lower single-input shuffles as it " "could then recurse on itself."); int Size = Mask.size(); // If this can be modeled as a broadcast of two elements followed by a blend, // prefer that lowering. This is especially important because broadcasts can // often fold with memory operands. auto DoBothBroadcast = [&] { int V1BroadcastIdx = -1, V2BroadcastIdx = -1; for (int M : Mask) if (M >= Size) { if (V2BroadcastIdx == -1) V2BroadcastIdx = M - Size; else if (M - Size != V2BroadcastIdx) return false; } else if (M >= 0) { if (V1BroadcastIdx == -1) V1BroadcastIdx = M; else if (M != V1BroadcastIdx) return false; } return true; }; if (DoBothBroadcast()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); // If the inputs all stem from a single 128-bit lane of each input, then we // split them rather than blending because the split will decompose to // unusually few instructions. int LaneCount = VT.getSizeInBits() / 128; int LaneSize = Size / LaneCount; SmallBitVector LaneInputs[2]; LaneInputs[0].resize(LaneCount, false); LaneInputs[1].resize(LaneCount, false); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); // Otherwise, just fall back to decomposed shuffles and a blend. This requires // that the decomposed single-input shuffles don't end up here. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); } /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as /// a permutation and blend of those lanes. /// /// This essentially blends the out-of-lane inputs to each lane into the lane /// from a permuted copy of the vector. This lowering strategy results in four /// instructions in the worst case for a single-input cross lane shuffle which /// is lower than any other fully general cross-lane shuffle strategy I'm aware /// of. Special cases for each particular shuffle pattern should be handled /// prior to trying this lowering. static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { // FIXME: This should probably be generalized for 512-bit vectors as well. assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!"); int LaneSize = Mask.size() / 2; // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track wether the given lane contains an element // that crosses to another lane. bool LaneCrossing[2] = {false, false}; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) LaneCrossing[(Mask[i] % Size) / LaneSize] = true; if (!LaneCrossing[0] || !LaneCrossing[1]) return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); if (isSingleInputShuffleMask(Mask)) { SmallVector FlippedBlendMask; for (int i = 0, Size = Mask.size(); i < Size; ++i) FlippedBlendMask.push_back( Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) ? Mask[i] : Mask[i] % LaneSize + (i / LaneSize) * LaneSize + Size)); // Flip the vector, and blend the results which should now be in-lane. The // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and // 5 for the high source. The value 3 selects the high half of source 2 and // the value 2 selects the low half of source 2. We only use source 2 to // allow folding it into a memory operand. unsigned PERMMask = 3 | 2 << 4; SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), V1, DAG.getConstant(PERMMask, MVT::i8)); return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); } // This now reduces to two single-input shuffles of V1 and V2 which at worst // will be handled by the above logic and a blend of the results, much like // other patterns in AVX. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); } /// \brief Handle lowering 2-lane 128-bit shuffles. static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { // Blends are faster and handle all the non-lane-crossing cases. if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Blend; MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); // Check for patterns which can be matched with a single insert of a 128-bit // subvector. if (isShuffleEquivalent(Mask, 0, 1, 0, 1) || isShuffleEquivalent(Mask, 0, 1, 4, 5)) { SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0)); SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0)); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); } if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) { SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0)); SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, DAG.getIntPtrConstant(2)); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); } // Otherwise form a 128-bit permutation. // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half. unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4; return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, DAG.getConstant(PermMask, MVT::i8)); } /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then /// shuffling each lane. /// /// This will only succeed when the result of fixing the 128-bit lanes results /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in /// each 128-bit lanes. This handles many cases where we can quickly blend away /// the lane crosses early and then use simpler shuffles within each lane. /// /// FIXME: It might be worthwhile at some point to support this without /// requiring the 128-bit lane-relative shuffles to be repeating, but currently /// in x86 only floating point has interesting non-repeating shuffles, and even /// those are still *marginally* more expensive. static SDValue lowerVectorShuffleByMerging128BitLanes( SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(!isSingleInputShuffleMask(Mask) && "This is only useful with multiple inputs."); int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); int NumLanes = Size / LaneSize; assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles."); // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also // check whether the in-128-bit lane shuffles share a repeating pattern. SmallVector Lanes; Lanes.resize(NumLanes, -1); SmallVector InLaneMask; InLaneMask.resize(LaneSize, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; int j = i / LaneSize; if (Lanes[j] < 0) { // First entry we've seen for this lane. Lanes[j] = Mask[i] / LaneSize; } else if (Lanes[j] != Mask[i] / LaneSize) { // This doesn't match the lane selected previously! return SDValue(); } // Check that within each lane we have a consistent shuffle mask. int k = i % LaneSize; if (InLaneMask[k] < 0) { InLaneMask[k] = Mask[i] % LaneSize; } else if (InLaneMask[k] != Mask[i] % LaneSize) { // This doesn't fit a repeating in-lane mask. return SDValue(); } } // First shuffle the lanes into place. MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64, VT.getSizeInBits() / 64); SmallVector LaneMask; LaneMask.resize(NumLanes * 2, -1); for (int i = 0; i < NumLanes; ++i) if (Lanes[i] >= 0) { LaneMask[2 * i + 0] = 2*Lanes[i] + 0; LaneMask[2 * i + 1] = 2*Lanes[i] + 1; } V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1); V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2); SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask); // Cast it back to the type we actually want. LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle); // Now do a simple shuffle that isn't lane crossing. SmallVector NewMask; NewMask.resize(Size, -1); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize; assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) && "Must not introduce lane crosses at this point!"); return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); } /// \brief Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// /// This returns true if the elements from a particular input are already in the /// slot required by the given mask and require no permutation. static bool isShuffleMaskInputInPlace(int Input, ArrayRef Mask) { assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) return false; return true; } /// \brief Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); SmallVector WidenedMask; if (canWidenShuffleElements(Mask, WidenedMask)) return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); if (isSingleInputShuffleMask(Mask)) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1, Mask, Subtarget, DAG)) return Broadcast; if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { // Non-half-crossing single input shuffles can be lowerid with an // interleaved permutation. unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, DAG.getConstant(VPERMILPMask, MVT::i8)); } // With AVX2 we have direct support for this permutation. if (Subtarget->hasAVX2()) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); } // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); if (NumV2Elements == 1 && Mask[0] >= 4) if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG)) return Insertion; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Blend; // Check if the blend happens to exactly fit that of SHUFPD. if ((Mask[0] == -1 || Mask[0] < 2) && (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) && (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) && (Mask[3] == -1 || Mask[3] >= 6)) { unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2, DAG.getConstant(SHUFPDMask, MVT::i8)); } if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) && (Mask[1] == -1 || Mask[1] < 2) && (Mask[2] == -1 || Mask[2] >= 6) && (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) { unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) | ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1, DAG.getConstant(SHUFPDMask, MVT::i8)); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)))) if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Result; // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. if (Subtarget->hasAVX2()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); } /// \brief Handle lowering of 4-lane 64-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!"); SmallVector WidenedMask; if (canWidenShuffleElements(Mask, WidenedMask)) return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1, Mask, Subtarget, DAG)) return Broadcast; // When the shuffle is mirrored between the 128-bit lanes of the unit, we can // use lower latency instructions that will operate on both 128-bit lanes. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { if (isSingleInputShuffleMask(Mask)) { int PSHUFDMask[] = {-1, -1, -1, -1}; for (int i = 0; i < 2; ++i) if (RepeatedMask[i] >= 0) { PSHUFDMask[2 * i] = 2 * RepeatedMask[i]; PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1; } return DAG.getNode( ISD::BITCAST, DL, MVT::v4i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); } // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); } // AVX2 provides a direct instruction for permuting a single input across // lanes. if (isSingleInputShuffleMask(Mask)) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)))) if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, DAG); } /// \brief Handle lowering of 8-lane 32-bit floating point shuffles. /// /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1, Mask, Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane, we have many more // options to efficiently lower the shuffle. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Repeated masks must be half the mask width!"); if (isSingleInputShuffleMask(Mask)) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. We also need to squash the // repeated mask into a simulated v4f32 mask. for (int i = 0; i < 4; ++i) if (RepeatedMask[i] >= 8) RepeatedMask[i] -= 4; return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); } // If we have a single input shuffle with different shuffle patterns in the // two 128-bit lanes use the variable mask to VPERMILPS. if (isSingleInputShuffleMask(Mask)) { SDValue VPermMask[8]; for (int i = 0; i < 8; ++i) VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(Mask[i], MVT::i32); if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) return DAG.getNode( X86ISD::VPERMILPV, DL, MVT::v8f32, V1, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)); if (Subtarget->hasAVX2()) return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, DAG.getNode(ISD::BITCAST, DL, MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)), V1); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Result; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget->hasAVX2()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); } /// \brief Handle lowering of 8-lane 32-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!"); if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1, Mask, Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the two 128-bit // lanes. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (isSingleInputShuffleMask(Mask)) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); } // If the shuffle patterns aren't repeated but it is a single input, directly // generate a cross-lane VPERMD instruction. if (isSingleInputShuffleMask(Mask)) { SDValue VPermMask[8]; for (int i = 0; i < 8; ++i) VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(Mask[i], MVT::i32); return DAG.getNode( X86ISD::VPERMV, DL, MVT::v8i32, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask, DAG); } /// \brief Handle lowering of 16-lane 16-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!"); // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, // First 128-bit lane: 0, 16, 1, 17, 2, 18, 3, 19, // Second 128-bit lane: 8, 24, 9, 25, 10, 26, 11, 27)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); if (isShuffleEquivalent(Mask, // First 128-bit lane: 4, 20, 5, 21, 6, 22, 7, 23, // Second 128-bit lane: 12, 28, 13, 29, 14, 30, 15, 31)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); if (isSingleInputShuffleMask(Mask)) { // There are no generalized cross-lane shuffle operations available on i16 // element types. if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); SDValue PSHUFBMask[32]; for (int i = 0; i < 16; ++i) { if (Mask[i] == -1) { PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8); continue; } int M = i < 8 ? Mask[i] : Mask[i] - 8; assert(M >= 0 && M < 8 && "Invalid single-input mask!"); PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8); PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8); } return DAG.getNode( ISD::BITCAST, DL, MVT::v16i16, DAG.getNode( X86ISD::PSHUFB, DL, MVT::v32i8, DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1), DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask))); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); } /// \brief Handle lowering of 32-lane 8-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!"); // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. // Note that these are repeated 128-bit lane unpacks, not unpacks across all // 256-bit lanes. if (isShuffleEquivalent( Mask, // First 128-bit lane: 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, // Second 128-bit lane: 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); if (isShuffleEquivalent( Mask, // First 128-bit lane: 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, // Second 128-bit lane: 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); if (isSingleInputShuffleMask(Mask)) { // There are no generalized cross-lane shuffle operations available on i8 // element types. if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); SDValue PSHUFBMask[32]; for (int i = 0; i < 32; ++i) PSHUFBMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8); return DAG.getNode( X86ISD::PSHUFB, DL, MVT::v32i8, V1, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); } /// \brief High-level routine to lower various 256-bit x86 vector shuffles. /// /// This routine either breaks down the specific type of a 256-bit x86 vector /// shuffle or splits it into two 128-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); // There is a really nice hard cut-over between AVX1 and AVX2 that means we can // check for those subtargets here and avoid much of the subtarget querying in // the per-vector-type lowering routines. With AVX1 we have essentially *zero* // ability to manipulate a 256-bit vector with integer types. Since we'll use // floating point types there eventually, just immediately cast everything to // a float and operate entirely in that domain. if (VT.isInteger() && !Subtarget->hasAVX2()) { int ElementBits = VT.getScalarSizeInBits(); if (ElementBits < 32) // No floating point type available, decompose into 128-bit vectors. return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), VT.getVectorNumElements()); V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1); V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); } switch (VT.SimpleTy) { case MVT::v4f64: return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v4i64: return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v8f32: return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v8i32: return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v16i16: return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v32i8: return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 256-bit x86 vector type!"); } } /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2); if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2); // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 16, 1, 17, 4, 20, 5, 21, 8, 24, 9, 25, 12, 28, 13, 29)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2); if (isShuffleEquivalent(Mask, 2, 18, 3, 19, 6, 22, 7, 23, 10, 26, 11, 27, 14, 30, 15, 31)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2); // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG); } /// \brief Handle lowering of 8-lane 64-bit integer shuffles. static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2); if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2); // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG); } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 16, 1, 17, 4, 20, 5, 21, 8, 24, 9, 25, 12, 28, 13, 29)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2); if (isShuffleEquivalent(Mask, 2, 18, 3, 19, 6, 22, 7, 23, 10, 26, 11, 27, 14, 30, 15, 31)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2); // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG); } /// \brief Handle lowering of 64-lane 8-bit integer shuffles. static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } /// \brief High-level routine to lower various 512-bit x86 vector shuffles. /// /// This routine either breaks down the specific type of a 512-bit x86 vector /// shuffle or splits it into two 256-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Subtarget->hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"); // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1, Mask, Subtarget, DAG)) return Broadcast; // Dispatch to each element type for lowering. If we don't have supprot for // specific element type shuffles at 512 bits, immediately split them and // lower them. Each lowering routine of a given type is allowed to assume that // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v8f64: return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v16f32: return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v8i64: return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v16i32: return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v32i16: if (Subtarget->hasBWI()) return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG); break; case MVT::v64i8: if (Subtarget->hasBWI()) return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG); break; default: llvm_unreachable("Not a valid 512-bit x86 vector type!"); } // Otherwise fall back on splitting. return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } /// \brief Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 /// vector shuffles. Most of the specific lowering strategies are encapsulated /// above in helper routines. The canonicalization attempts to widen shuffles /// to involve fewer lanes of wider elements, consolidate symmetric patterns /// s.t. only one of the two inputs needs to be tested, etc. static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); int NumElements = VT.getVectorNumElements(); SDLoc dl(Op); assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; if (V1IsUndef && V2IsUndef) return DAG.getUNDEF(VT); // When we create a shuffle node we put the UNDEF node to second operand, // but in some cases the first operand may be transformed to UNDEF. // In this case we should just commute the node. if (V1IsUndef) return DAG.getCommutedVectorShuffle(*SVOp); // Check for non-undef masks pointing at an undef vector and make the masks // undef as well. This makes it easier to match the shuffle based solely on // the mask. if (V2IsUndef) for (int M : Mask) if (M >= NumElements) { SmallVector NewMask(Mask.begin(), Mask.end()); for (int &M : NewMask) if (M >= NumElements) M = -1; return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); } // Try to collapse shuffles into using a vector type with fewer elements but // wider element types. We cap this to not form integers or floating point // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector WidenedMask; if (VT.getScalarSizeInBits() < 64 && canWidenShuffleElements(Mask, WidenedMask)) { MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); // Make sure that the new vector type is legal. For example, v2f64 isn't // legal on SSE1. if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask)); } } int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; for (int M : SVOp->getMask()) if (M < 0) ++NumUndefElements; else if (M < NumElements) ++NumV1Elements; else ++NumV2Elements; // Commute the shuffle as needed such that more elements come from V1 than // V2. This allows us to match the shuffle pattern strictly on how many // elements come from V1 without handling the symmetric cases. if (NumV2Elements > NumV1Elements) return DAG.getCommutedVectorShuffle(*SVOp); // When the number of V1 and V2 elements are the same, try to minimize the // number of uses of V2 in the low half of the vector. When that is tied, // ensure that the sum of indices for V1 is equal to or lower than the sum // indices for V2. When those are equal, try to ensure that the number of odd // indices for V1 is lower than the number of odd indices for V2. if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; for (int M : SVOp->getMask().slice(0, NumElements / 2)) if (M >= NumElements) ++LowV2Elements; else if (M >= 0) ++LowV1Elements; if (LowV2Elements > LowV1Elements) { return DAG.getCommutedVectorShuffle(*SVOp); } else if (LowV2Elements == LowV1Elements) { int SumV1Indices = 0, SumV2Indices = 0; for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) if (SVOp->getMask()[i] >= NumElements) SumV2Indices += i; else if (SVOp->getMask()[i] >= 0) SumV1Indices += i; if (SumV2Indices < SumV1Indices) { return DAG.getCommutedVectorShuffle(*SVOp); } else if (SumV2Indices == SumV1Indices) { int NumV1OddIndices = 0, NumV2OddIndices = 0; for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) if (SVOp->getMask()[i] >= NumElements) NumV2OddIndices += i % 2; else if (SVOp->getMask()[i] >= 0) NumV1OddIndices += i % 2; if (NumV2OddIndices < NumV1OddIndices) return DAG.getCommutedVectorShuffle(*SVOp); } } } // For each vector width, delegate to a specialized lowering routine. if (VT.getSizeInBits() == 128) return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); if (VT.getSizeInBits() == 256) return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); // Force AVX-512 vectors to be scalarized for now. // FIXME: Implement AVX-512 support! if (VT.getSizeInBits() == 512) return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } //===----------------------------------------------------------------------===// // Legacy vector shuffle lowering // // This code is the legacy code handling vector shuffles until the above // replaces its functionality and performance. //===----------------------------------------------------------------------===// static bool isBlendMask(ArrayRef MaskVals, MVT VT, bool hasSSE41, bool hasInt256, unsigned *MaskOut = nullptr) { MVT EltVT = VT.getVectorElementType(); // There is no blend with immediate in AVX-512. if (VT.is512BitVector()) return false; if (!hasSSE41 || EltVT == MVT::i8) return false; if (!hasInt256 && VT == MVT::v16i16) return false; unsigned MaskValue = 0; unsigned NumElems = VT.getVectorNumElements(); // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. unsigned NumLanes = (NumElems - 1) / 8 + 1; unsigned NumElemsInLane = NumElems / NumLanes; // Blend for v16i16 should be symetric for the both lanes. for (unsigned i = 0; i < NumElemsInLane; ++i) { int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1; int EltIdx = MaskVals[i]; if ((EltIdx < 0 || EltIdx == (int)i) && (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane))) continue; if (((unsigned)EltIdx == (i + NumElems)) && (SndLaneEltIdx < 0 || (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) MaskValue |= (1 << i); else return false; } if (MaskOut) *MaskOut = MaskValue; return true; } // Try to lower a shuffle node into a simple blend instruction. // This function assumes isBlendMask returns true for this // SuffleVectorSDNode static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, unsigned MaskValue, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = SVOp->getSimpleValueType(0); MVT EltVT = VT.getVectorElementType(); assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(), Subtarget->hasInt256() && "Trying to lower a " "VECTOR_SHUFFLE to a Blend but " "with the wrong mask")); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); SDLoc dl(SVOp); unsigned NumElems = VT.getVectorNumElements(); // Convert i32 vectors to floating point if it is not AVX2. // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. MVT BlendVT = VT; if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), NumElems); V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1); V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2); } SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, DAG.getConstant(MaskValue, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, VT, Ret); } /// In vector type \p VT, return true if the element at index \p InputIdx /// falls on a different 128-bit lane than \p OutputIdx. static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx, unsigned OutputIdx) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128; } /// Generate a PSHUFB if possible. Selects elements from \p V1 according to /// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to /// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a /// zero. static SDValue getPSHUFB(ArrayRef MaskVals, SDValue V1, SDLoc &dl, SelectionDAG &DAG) { MVT VT = V1.getSimpleValueType(); assert(VT.is128BitVector() || VT.is256BitVector()); MVT EltVT = VT.getVectorElementType(); unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8; unsigned NumElts = VT.getVectorNumElements(); SmallVector PshufbMask; for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) { int InputIdx = MaskVals[OutputIdx]; unsigned InputByteIdx; if (InputIdx < 0 || NumElts <= (unsigned)InputIdx) InputByteIdx = 0x80; else { // Cross lane is not allowed. if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx)) return SDValue(); InputByteIdx = InputIdx * EltSizeInBytes; // Index is an byte offset within the 128-bit lane. InputByteIdx &= 0xf; } for (unsigned j = 0; j < EltSizeInBytes; ++j) { PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8)); if (InputByteIdx != 0x80) ++InputByteIdx; } } MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size()); if (ShufVT != VT) V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1); return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask)); } // v8i16 shuffles - Prefer shuffles in the following order: // 1. [all] pshuflw, pshufhw, optional move // 2. [ssse3] 1 x pshufb // 3. [ssse3] 2 x pshufb + 1 x por // 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) static SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(Op); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); SDLoc dl(SVOp); SmallVector MaskVals; // Determine if more than 1 of the words in each of the low and high quadwords // of the result come from the same quadword of one of the two inputs. Undef // mask values count as coming from any quadword, for better codegen. // // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input // feeds this quad. For i, 0 and 1 refer to V1, 2 and 3 refer to V2. unsigned LoQuad[] = { 0, 0, 0, 0 }; unsigned HiQuad[] = { 0, 0, 0, 0 }; // Indices of quads used. std::bitset<4> InputQuads; for (unsigned i = 0; i < 8; ++i) { unsigned *Quad = i < 4 ? LoQuad : HiQuad; int EltIdx = SVOp->getMaskElt(i); MaskVals.push_back(EltIdx); if (EltIdx < 0) { ++Quad[0]; ++Quad[1]; ++Quad[2]; ++Quad[3]; continue; } ++Quad[EltIdx / 4]; InputQuads.set(EltIdx / 4); } int BestLoQuad = -1; unsigned MaxQuad = 1; for (unsigned i = 0; i < 4; ++i) { if (LoQuad[i] > MaxQuad) { BestLoQuad = i; MaxQuad = LoQuad[i]; } } int BestHiQuad = -1; MaxQuad = 1; for (unsigned i = 0; i < 4; ++i) { if (HiQuad[i] > MaxQuad) { BestHiQuad = i; MaxQuad = HiQuad[i]; } } // For SSSE3, If all 8 words of the result come from only 1 quadword of each // of the two input vectors, shuffle them into one input vector so only a // single pshufb instruction is necessary. If there are more than 2 input // quads, disable the next transformation since it does not help SSSE3. bool V1Used = InputQuads[0] || InputQuads[1]; bool V2Used = InputQuads[2] || InputQuads[3]; if (Subtarget->hasSSSE3()) { if (InputQuads.count() == 2 && V1Used && V2Used) { BestLoQuad = InputQuads[0] ? 0 : 1; BestHiQuad = InputQuads[2] ? 2 : 3; } if (InputQuads.count() > 2) { BestLoQuad = -1; BestHiQuad = -1; } } // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update // the shuffle mask. If a quad is scored as -1, that means that it contains // words from all 4 input quadwords. SDValue NewV; if (BestLoQuad >= 0 || BestHiQuad >= 0) { int MaskV[] = { BestLoQuad < 0 ? 0 : BestLoQuad, BestHiQuad < 0 ? 1 : BestHiQuad }; NewV = DAG.getVectorShuffle(MVT::v2i64, dl, DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the // source words for the shuffle, to aid later transformations. bool AllWordsInNewV = true; bool InOrder[2] = { true, true }; for (unsigned i = 0; i != 8; ++i) { int idx = MaskVals[i]; if (idx != (int)i) InOrder[i/4] = false; if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) continue; AllWordsInNewV = false; break; } bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; if (AllWordsInNewV) { for (int i = 0; i != 8; ++i) { int idx = MaskVals[i]; if (idx < 0) continue; idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; if ((idx != i) && idx < 4) pshufhw = false; if ((idx != i) && idx > 3) pshuflw = false; } V1 = NewV; V2Used = false; BestLoQuad = 0; BestHiQuad = 1; } // If we've eliminated the use of V2, and the new mask is a pshuflw or // pshufhw, that's as cheap as it gets. Return the new shuffle. if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; unsigned TargetMask = 0; NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); ShuffleVectorSDNode *SVOp = cast(NewV.getNode()); TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp): getShufflePSHUFLWImmediate(SVOp); V1 = NewV.getOperand(0); return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); } } // Promote splats to a larger type which usually leads to more efficient code. // FIXME: Is this true if pshufb is available? if (SVOp->isSplat()) return PromoteSplat(SVOp, DAG); // If we have SSSE3, and all words of the result are from 1 input vector, // case 2 is generated, otherwise case 3 is generated. If no SSSE3 // is present, fall back to case 4. if (Subtarget->hasSSSE3()) { SmallVector pshufbMask; // If we have elements from both input vectors, set the high bit of the // shuffle mask element to zero out elements that come from V2 in the V1 // mask, and elements that come from V1 in the V2 mask, so that the two // results can be OR'd together. bool TwoInputs = V1Used && V2Used; V1 = getPSHUFB(MaskVals, V1, dl, DAG); if (!TwoInputs) return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); // Calculate the shuffle mask for the second input, shuffle it, and // OR it with the first shuffled input. CommuteVectorShuffleMask(MaskVals, 8); V2 = getPSHUFB(MaskVals, V2, dl, DAG); V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); } // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, // and update MaskVals with new element order. std::bitset<8> InOrder; if (BestLoQuad >= 0) { int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; for (int i = 0; i != 4; ++i) { int idx = MaskVals[i]; if (idx < 0) { InOrder.set(i); } else if ((idx / 4) == BestLoQuad) { MaskV[i] = idx & 3; InOrder.set(i); } } NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { ShuffleVectorSDNode *SVOp = cast(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, NewV.getOperand(0), getShufflePSHUFLWImmediate(SVOp), DAG); } } // If BestHi >= 0, generate a pshufhw to put the high elements in order, // and update MaskVals with the new element order. if (BestHiQuad >= 0) { int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; for (unsigned i = 4; i != 8; ++i) { int idx = MaskVals[i]; if (idx < 0) { InOrder.set(i); } else if ((idx / 4) == BestHiQuad) { MaskV[i] = (idx & 3) + 4; InOrder.set(i); } } NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { ShuffleVectorSDNode *SVOp = cast(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, NewV.getOperand(0), getShufflePSHUFHWImmediate(SVOp), DAG); } } // In case BestHi & BestLo were both -1, which means each quadword has a word // from each of the four input quadwords, calculate the InOrder bitvector now // before falling through to the insert/extract cleanup. if (BestLoQuad == -1 && BestHiQuad == -1) { NewV = V1; for (int i = 0; i != 8; ++i) if (MaskVals[i] < 0 || MaskVals[i] == i) InOrder.set(i); } // The other elements are put in the right place using pextrw and pinsrw. for (unsigned i = 0; i != 8; ++i) { if (InOrder[i]) continue; int EltIdx = MaskVals[i]; if (EltIdx < 0) continue; SDValue ExtOp = (EltIdx < 8) ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, DAG.getIntPtrConstant(EltIdx)) : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, DAG.getIntPtrConstant(EltIdx - 8)); NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, DAG.getIntPtrConstant(i)); } return NewV; } /// \brief v16i16 shuffles /// /// FIXME: We only support generation of a single pshufb currently. We can /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as /// well (e.g 2 x pshufb + 1 x por). static SDValue LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(Op); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); SDLoc dl(SVOp); if (V2.getOpcode() != ISD::UNDEF) return SDValue(); SmallVector MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); return getPSHUFB(MaskVals, V1, dl, DAG); } // v16i8 shuffles - Prefer shuffles in the following order: // 1. [ssse3] 1 x pshufb // 2. [ssse3] 2 x pshufb + 1 x por // 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, const X86Subtarget* Subtarget, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); SDLoc dl(SVOp); ArrayRef MaskVals = SVOp->getMask(); // Promote splats to a larger type which usually leads to more efficient code. // FIXME: Is this true if pshufb is available? if (SVOp->isSplat()) return PromoteSplat(SVOp, DAG); // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is // present, fall back to case 3. // If SSSE3, use 1 pshufb instruction per vector with elements in the result. if (Subtarget->hasSSSE3()) { SmallVector pshufbMask; // If all result elements are from one input vector, then only translate // undef mask values to 0x80 (zero out result) in the pshufb mask. // // Otherwise, we have elements from both input vectors, and must zero out // elements that come from V2 in the first mask, and V1 in the second mask // so that we can OR them together. for (unsigned i = 0; i != 16; ++i) { int EltIdx = MaskVals[i]; if (EltIdx < 0 || EltIdx >= 16) EltIdx = 0x80; pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); } V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, pshufbMask)); // As PSHUFB will zero elements with negative indices, it's safe to ignore // the 2nd operand if it's undefined or zero. if (V2.getOpcode() == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) return V1; // Calculate the shuffle mask for the second input, shuffle it, and // OR it with the first shuffled input. pshufbMask.clear(); for (unsigned i = 0; i != 16; ++i) { int EltIdx = MaskVals[i]; EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16; pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); } V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, pshufbMask)); return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); } // No SSSE3 - Calculate in place words and then fix all out of place words // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from // the 16 different words that comprise the two doublequadword input vectors. V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); SDValue NewV = V1; for (int i = 0; i != 8; ++i) { int Elt0 = MaskVals[i*2]; int Elt1 = MaskVals[i*2+1]; // This word of the result is all undef, skip it. if (Elt0 < 0 && Elt1 < 0) continue; // This word of the result is already in the correct place, skip it. if ((Elt0 == i*2) && (Elt1 == i*2+1)) continue; SDValue Elt0Src = Elt0 < 16 ? V1 : V2; SDValue Elt1Src = Elt1 < 16 ? V1 : V2; SDValue InsElt; // If Elt0 and Elt1 are defined, are consecutive, and can be load // using a single extract together, load it and store it. if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, DAG.getIntPtrConstant(Elt1 / 2)); NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, DAG.getIntPtrConstant(i)); continue; } // If Elt1 is defined, extract it from the appropriate source. If the // source byte is not also odd, shift the extracted word left 8 bits // otherwise clear the bottom 8 bits if we need to do an or. if (Elt1 >= 0) { InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, DAG.getIntPtrConstant(Elt1 / 2)); if ((Elt1 & 1) == 0) InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, DAG.getConstant(8, TLI.getShiftAmountTy(InsElt.getValueType()))); else if (Elt0 >= 0) InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, DAG.getConstant(0xFF00, MVT::i16)); } // If Elt0 is defined, extract it from the appropriate source. If the // source byte is not also even, shift the extracted word right 8 bits. If // Elt1 was also defined, OR the extracted values together before // inserting them in the result. if (Elt0 >= 0) { SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); if ((Elt0 & 1) != 0) InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, DAG.getConstant(8, TLI.getShiftAmountTy(InsElt0.getValueType()))); else if (Elt1 >= 0) InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, DAG.getConstant(0x00FF, MVT::i16)); InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) : InsElt0; } NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, DAG.getIntPtrConstant(i)); } return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); } // v32i8 shuffles - Translate to VPSHUFB if possible. static SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = SVOp->getSimpleValueType(0); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); SDLoc dl(SVOp); SmallVector MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode()); // VPSHUFB may be generated if // (1) one of input vector is undefined or zeroinitializer. // The mask value 0x80 puts 0 in the corresponding slot of the vector. // And (2) the mask indexes don't cross the 128-bit lane. if (VT != MVT::v32i8 || !Subtarget->hasInt256() || (!V2IsUndef && !V2IsAllZero && !V1IsAllZero)) return SDValue(); if (V1IsAllZero && !V2IsAllZero) { CommuteVectorShuffleMask(MaskVals, 32); V1 = V2; } return getPSHUFB(MaskVals, V1, dl, DAG); } /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be /// done when every pair / quad of shuffle mask elements point to elements in /// the right sequence. e.g. /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> static SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { MVT VT = SVOp->getSimpleValueType(0); SDLoc dl(SVOp); unsigned NumElems = VT.getVectorNumElements(); MVT NewVT; unsigned Scale; switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected!"); case MVT::v2i64: case MVT::v2f64: return SDValue(SVOp, 0); case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break; case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break; case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break; case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break; case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break; case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break; } SmallVector MaskVec; for (unsigned i = 0; i != NumElems; i += Scale) { int StartIdx = -1; for (unsigned j = 0; j != Scale; ++j) { int EltIdx = SVOp->getMaskElt(i+j); if (EltIdx < 0) continue; if (StartIdx < 0) StartIdx = (EltIdx / Scale); if (EltIdx != (int)(StartIdx*Scale + j)) return SDValue(); } MaskVec.push_back(StartIdx); } SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0)); SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1)); return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); } /// getVZextMovL - Return a zero-extending vector move low node. /// static SDValue getVZextMovL(MVT VT, MVT OpVT, SDValue SrcOp, SelectionDAG &DAG, const X86Subtarget *Subtarget, SDLoc dl) { if (VT == MVT::v2f64 || VT == MVT::v4f32) { LoadSDNode *LD = nullptr; if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) LD = dyn_cast(SrcOp); if (!LD) { // movssrr and movsdrr do not clear top bits. Try to use movd, movq // instead. MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { // PR2108 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, OpVT, SrcOp.getOperand(0) .getOperand(0)))); } } } return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, SrcOp))); } /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles /// which could not be matched by any known target speficic shuffle static SDValue LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG); if (NewOp.getNode()) return NewOp; MVT VT = SVOp->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); unsigned NumLaneElems = NumElems / 2; SDLoc dl(SVOp); MVT EltVT = VT.getVectorElementType(); MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); SDValue Output[2]; SmallVector Mask; for (unsigned l = 0; l < 2; ++l) { // Build a shuffle mask for the output, discovering on the fly which // input vectors to use as shuffle operands (recorded in InputUsed). // If building a suitable shuffle vector proves too hard, then bail // out with UseBuildVector set. bool UseBuildVector = false; int InputUsed[2] = { -1, -1 }; // Not yet discovered. unsigned LaneStart = l * NumLaneElems; for (unsigned i = 0; i != NumLaneElems; ++i) { // The mask element. This indexes into the input. int Idx = SVOp->getMaskElt(i+LaneStart); if (Idx < 0) { // the mask element does not index into any input vector. Mask.push_back(-1); continue; } // The input vector this mask element indexes into. int Input = Idx / NumLaneElems; // Turn the index into an offset from the start of the input vector. Idx -= Input * NumLaneElems; // Find or create a shuffle vector operand to hold this input. unsigned OpNo; for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { if (InputUsed[OpNo] == Input) // This input vector is already an operand. break; if (InputUsed[OpNo] < 0) { // Create a new operand for this input vector. InputUsed[OpNo] = Input; break; } } if (OpNo >= array_lengthof(InputUsed)) { // More than two input vectors used! Give up on trying to create a // shuffle vector. Insert all elements into a BUILD_VECTOR instead. UseBuildVector = true; break; } // Add the mask index for the new shuffle vector. Mask.push_back(Idx + OpNo * NumLaneElems); } if (UseBuildVector) { SmallVector SVOps; for (unsigned i = 0; i != NumLaneElems; ++i) { // The mask element. This indexes into the input. int Idx = SVOp->getMaskElt(i+LaneStart); if (Idx < 0) { SVOps.push_back(DAG.getUNDEF(EltVT)); continue; } // The input vector this mask element indexes into. int Input = Idx / NumElems; // Turn the index into an offset from the start of the input vector. Idx -= Input * NumElems; // Extract the vector element by hand. SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SVOp->getOperand(Input), DAG.getIntPtrConstant(Idx))); } // Construct the output using a BUILD_VECTOR. Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps); } else if (InputUsed[0] < 0) { // No input vectors were used! The result is undefined. Output[l] = DAG.getUNDEF(NVT); } else { SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), (InputUsed[0] % 2) * NumLaneElems, DAG, dl); // If only one input was used, use an undefined vector for the other. SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), (InputUsed[1] % 2) * NumLaneElems, DAG, dl); // At least one input vector was used. Create a new shuffle vector. Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); } Mask.clear(); } // Concatenate the result back return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]); } /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with /// 4 elements, and match them with several different shuffle types. static SDValue LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); SDLoc dl(SVOp); MVT VT = SVOp->getSimpleValueType(0); assert(VT.is128BitVector() && "Unsupported vector size"); std::pair Locs[4]; int Mask1[] = { -1, -1, -1, -1 }; SmallVector PermMask(SVOp->getMask().begin(), SVOp->getMask().end()); unsigned NumHi = 0; unsigned NumLo = 0; for (unsigned i = 0; i != 4; ++i) { int Idx = PermMask[i]; if (Idx < 0) { Locs[i] = std::make_pair(-1, -1); } else { assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); if (Idx < 4) { Locs[i] = std::make_pair(0, NumLo); Mask1[NumLo] = Idx; NumLo++; } else { Locs[i] = std::make_pair(1, NumHi); if (2+NumHi < 4) Mask1[2+NumHi] = Idx; NumHi++; } } } if (NumLo <= 2 && NumHi <= 2) { // If no more than two elements come from either vector. This can be // implemented with two shuffles. First shuffle gather the elements. // The second shuffle, which takes the first shuffle as both of its // vector operands, put the elements into the right order. V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); int Mask2[] = { -1, -1, -1, -1 }; for (unsigned i = 0; i != 4; ++i) if (Locs[i].first != -1) { unsigned Idx = (i < 2) ? 0 : 4; Idx += Locs[i].first * 2 + Locs[i].second; Mask2[i] = Idx; } return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); } if (NumLo == 3 || NumHi == 3) { // Otherwise, we must have three elements from one vector, call it X, and // one element from the other, call it Y. First, use a shufps to build an // intermediate vector with the one element from Y and the element from X // that will be in the same half in the final destination (the indexes don't // matter). Then, use a shufps to build the final vector, taking the half // containing the element from Y from the intermediate, and the other half // from X. if (NumHi == 3) { // Normalize it so the 3 elements come from V1. CommuteVectorShuffleMask(PermMask, 4); std::swap(V1, V2); } // Find the element from V2. unsigned HiIndex; for (HiIndex = 0; HiIndex < 3; ++HiIndex) { int Val = PermMask[HiIndex]; if (Val < 0) continue; if (Val >= 4) break; } Mask1[0] = PermMask[HiIndex]; Mask1[1] = -1; Mask1[2] = PermMask[HiIndex^1]; Mask1[3] = -1; V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); if (HiIndex >= 2) { Mask1[0] = PermMask[0]; Mask1[1] = PermMask[1]; Mask1[2] = HiIndex & 1 ? 6 : 4; Mask1[3] = HiIndex & 1 ? 4 : 6; return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); } Mask1[0] = HiIndex & 1 ? 2 : 0; Mask1[1] = HiIndex & 1 ? 0 : 2; Mask1[2] = PermMask[2]; Mask1[3] = PermMask[3]; if (Mask1[2] >= 0) Mask1[2] += 4; if (Mask1[3] >= 0) Mask1[3] += 4; return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); } // Break it into (shuffle shuffle_hi, shuffle_lo). int LoMask[] = { -1, -1, -1, -1 }; int HiMask[] = { -1, -1, -1, -1 }; int *MaskPtr = LoMask; unsigned MaskIdx = 0; unsigned LoIdx = 0; unsigned HiIdx = 2; for (unsigned i = 0; i != 4; ++i) { if (i == 2) { MaskPtr = HiMask; MaskIdx = 1; LoIdx = 0; HiIdx = 2; } int Idx = PermMask[i]; if (Idx < 0) { Locs[i] = std::make_pair(-1, -1); } else if (Idx < 4) { Locs[i] = std::make_pair(MaskIdx, LoIdx); MaskPtr[LoIdx] = Idx; LoIdx++; } else { Locs[i] = std::make_pair(MaskIdx, HiIdx); MaskPtr[HiIdx] = Idx; HiIdx++; } } SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); int MaskOps[] = { -1, -1, -1, -1 }; for (unsigned i = 0; i != 4; ++i) if (Locs[i].first != -1) MaskOps[i] = Locs[i].first * 4 + Locs[i].second; return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); } static bool MayFoldVectorLoad(SDValue V) { while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) V = V.getOperand(0); if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) // BUILD_VECTOR (load), undef V = V.getOperand(0); return MayFoldLoad(V); } static SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // Canonizalize to v2f64. V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); return DAG.getNode(ISD::BITCAST, dl, VT, getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, V1, DAG)); } static SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); assert(VT != MVT::v2i64 && "unsupported shuffle type"); if (HasSSE2 && VT == MVT::v2f64) return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1) return DAG.getNode(ISD::BITCAST, dl, VT, getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32, DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1), DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG)); } static SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); assert((VT == MVT::v4i32 || VT == MVT::v4f32) && "unsupported shuffle type"); if (V2.getOpcode() == ISD::UNDEF) V2 = V1; // v4i32 or v4f32 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); } static SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); unsigned NumElems = VT.getVectorNumElements(); // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second // operand of these instructions is only memory, so check if there's a // potencial load folding here, otherwise use SHUFPS or MOVSD to match the // same masks. bool CanFoldLoad = false; // Trivial case, when V2 comes from a load. if (MayFoldVectorLoad(V2)) CanFoldLoad = true; // When V1 is a load, it can be folded later into a store in isel, example: // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) // turns into: // (MOVLPSmr addr:$src1, VR128:$src2) // So, recognize this potential and also use MOVLPS or MOVLPD else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) CanFoldLoad = true; ShuffleVectorSDNode *SVOp = cast(Op); if (CanFoldLoad) { if (HasSSE2 && NumElems == 2) return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); if (NumElems == 4) // If we don't care about the second element, proceed to use movss. if (SVOp->getMaskElt(1) != -1) return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); } // movl and movlp will both match v2i64, but v2i64 is never matched by // movl earlier because we make it strict to avoid messing with the movlp load // folding logic (see the code above getMOVLP call). Match it here then, // this is horrible, but will stay like this until we move all shuffle // matching to x86 specific nodes. Note that for the 1st condition all // types are matched with movsd. if (HasSSE2) { // FIXME: isMOVLMask should be checked and matched before getMOVLP, // as to remove this logic from here, as much as possible if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT)) return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); } assert(VT != MVT::v4i32 && "unsupported shuffle type"); // Invert the operand order and use SHUFPS to match it. return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1, getShuffleSHUFImmediate(SVOp), DAG); } static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index, SelectionDAG &DAG) { SDLoc dl(Load); MVT VT = Load->getSimpleValueType(0); MVT EVT = VT.getVectorElementType(); SDValue Addr = Load->getOperand(1); SDValue NewAddr = DAG.getNode( ISD::ADD, dl, Addr.getSimpleValueType(), Addr, DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType())); SDValue NewLoad = DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( Load->getMemOperand(), 0, EVT.getStoreSize())); return NewLoad; } // It is only safe to call this function if isINSERTPSMask is true for // this shufflevector mask. static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, SelectionDAG &DAG) { // Generate an insertps instruction when inserting an f32 from memory onto a // v4f32 or when copying a member from one v4f32 to another. // We also use it for transferring i32 from one register to another, // since it simply copies the same bits. // If we're transferring an i32 from memory to a specific element in a // register, we output a generic DAG that will match the PINSRD // instruction. MVT VT = SVOp->getSimpleValueType(0); MVT EVT = VT.getVectorElementType(); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); auto Mask = SVOp->getMask(); assert((VT == MVT::v4f32 || VT == MVT::v4i32) && "unsupported vector type for insertps/pinsrd"); auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; }; auto FromV2Predicate = [](const int &i) { return i >= 4; }; int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate); SDValue From; SDValue To; unsigned DestIndex; if (FromV1 == 1) { From = V1; To = V2; DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) - Mask.begin(); // If we have 1 element from each vector, we have to check if we're // changing V1's element's place. If so, we're done. Otherwise, we // should assume we're changing V2's element's place and behave // accordingly. int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate); assert(DestIndex <= INT32_MAX && "truncated destination index"); if (FromV1 == FromV2 && static_cast(DestIndex) == Mask[DestIndex] % 4) { From = V2; To = V1; DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); } } else { assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 && "More than one element from V1 and from V2, or no elements from one " "of the vectors. This case should not have returned true from " "isINSERTPSMask"); From = V2; To = V1; DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); } // Get an index into the source vector in the range [0,4) (the mask is // in the range [0,8) because it can address V1 and V2) unsigned SrcIndex = Mask[DestIndex] % 4; if (MayFoldLoad(From)) { // Trivial case, when From comes from a load and is only used by the // shuffle. Make it use insertps from the vector that we need from that // load. SDValue NewLoad = NarrowVectorLoadToElement(cast(From), SrcIndex, DAG); if (!NewLoad.getNode()) return SDValue(); if (EVT == MVT::f32) { // Create this as a scalar to vector to match the instruction pattern. SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad); SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4); return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector, InsertpsMask); } else { // EVT == MVT::i32 // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT // instruction, to match the PINSRD instruction, which loads an i32 to a // certain vector element. return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad, DAG.getConstant(DestIndex, MVT::i32)); } } // Vector-element-to-vector SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6); return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask); } // Reduce a vector shuffle to zext. static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { // PMOVZX is only available from SSE41. if (!Subtarget->hasSSE41()) return SDValue(); MVT VT = Op.getSimpleValueType(); // Only AVX2 support 256-bit vector integer extending. if (!Subtarget->hasInt256() && VT.is256BitVector()) return SDValue(); ShuffleVectorSDNode *SVOp = cast(Op); SDLoc DL(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = VT.getVectorNumElements(); // Extending is an unary operation and the element type of the source vector // won't be equal to or larger than i64. if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() || VT.getVectorElementType() == MVT::i64) return SDValue(); // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4. unsigned Shift = 1; // Start from 2, i.e. 1 << 1. while ((1U << Shift) < NumElems) { if (SVOp->getMaskElt(1U << Shift) == 1) break; Shift += 1; // The maximal ratio is 8, i.e. from i8 to i64. if (Shift > 3) return SDValue(); } // Check the shuffle mask. unsigned Mask = (1U << Shift) - 1; for (unsigned i = 0; i != NumElems; ++i) { int EltIdx = SVOp->getMaskElt(i); if ((i & Mask) != 0 && EltIdx != -1) return SDValue(); if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift)) return SDValue(); } unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift; MVT NeVT = MVT::getIntegerVT(NBits); MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift); if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT)) return SDValue(); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); } static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(Op); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); if (isZeroShuffle(SVOp)) return getZeroVector(VT, Subtarget, DAG, dl); // Handle splat operations if (SVOp->isSplat()) { // Use vbroadcast whenever the splat comes from a foldable load SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); if (Broadcast.getNode()) return Broadcast; } // Check integer expanding shuffles. SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG); if (NewOp.getNode()) return NewOp; // If the shuffle can be profitably rewritten as a narrower shuffle, then // do it! if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 || VT == MVT::v32i8) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); } else if (VT.is128BitVector() && Subtarget->hasSSE2()) { // FIXME: Figure out a cleaner way to do this. if (ISD::isBuildVectorAllZeros(V2.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { MVT NewVT = NewOp.getSimpleValueType(); if (isCommutedMOVLMask(cast(NewOp)->getMask(), NewVT, true, false)) return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget, dl); } } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { MVT NewVT = NewOp.getSimpleValueType(); if (isMOVLMask(cast(NewOp)->getMask(), NewVT)) return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget, dl); } } } return SDValue(); } SDValue X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); unsigned NumElems = VT.getVectorNumElements(); bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; bool V1IsSplat = false; bool V2IsSplat = false; bool HasSSE2 = Subtarget->hasSSE2(); bool HasFp256 = Subtarget->hasFp256(); bool HasInt256 = Subtarget->hasInt256(); MachineFunction &MF = DAG.getMachineFunction(); bool OptForSize = MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); // Check if we should use the experimental vector shuffle lowering. If so, // delegate completely to that code path. if (ExperimentalVectorShuffleLowering) return lowerVectorShuffle(Op, Subtarget, DAG); assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); if (V1IsUndef && V2IsUndef) return DAG.getUNDEF(VT); // When we create a shuffle node we put the UNDEF node to second operand, // but in some cases the first operand may be transformed to UNDEF. // In this case we should just commute the node. if (V1IsUndef) return DAG.getCommutedVectorShuffle(*SVOp); // Vector shuffle lowering takes 3 steps: // // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable // narrowing and commutation of operands should be handled. // 2) Matching of shuffles with known shuffle masks to x86 target specific // shuffle nodes. // 3) Rewriting of unmatched masks into new generic shuffle operations, // so the shuffle can be broken into other shuffles and the legalizer can // try the lowering again. // // The general idea is that no vector_shuffle operation should be left to // be matched during isel, all of them must be converted to a target specific // node here. // Normalize the input vectors. Here splats, zeroed vectors, profitable // narrowing and commutation of operands should be handled. The actual code // doesn't include all of those, work in progress... SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG); if (NewOp.getNode()) return NewOp; SmallVector M(SVOp->getMask().begin(), SVOp->getMask().end()); // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and // unpckh_undef). Only use pshufd if speed is more important than size. if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() && V2IsUndef && MayFoldVectorLoad(V1)) return getMOVDDup(Op, dl, V1, DAG); if (isMOVHLPS_v_undef_Mask(M, VT)) return getMOVHighToLow(Op, dl, DAG); // Use to match splats if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef && (VT == MVT::v2f64 || VT == MVT::v2i64)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); if (isPSHUFDMask(M, VT)) { // The actual implementation will match the mask in the if above and then // during isel it can match several different instructions, not only pshufd // as its name says, sad but true, emulate the behavior for now... if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); unsigned TargetMask = getShuffleSHUFImmediate(SVOp); if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64)) return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask, DAG); return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, TargetMask, DAG); } if (isPALIGNRMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2, getShufflePALIGNRImmediate(SVOp), DAG); if (isVALIGNMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2, getShuffleVALIGNImmediate(SVOp), DAG); // Check if this can be converted into a logical shift. bool isLeft = false; unsigned ShAmt = 0; SDValue ShVal; bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); if (isShift && ShVal.hasOneUse()) { // If the shifted value has multiple uses, it may be cheaper to use // v_set0 + movlhps or movhlps, etc. MVT EltVT = VT.getVectorElementType(); ShAmt *= EltVT.getSizeInBits(); return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); } if (isMOVLMask(M, VT)) { if (ISD::isBuildVectorAllZeros(V1.getNode())) return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); if (!isMOVLPMask(M, VT)) { if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); if (VT == MVT::v4i32 || VT == MVT::v4f32) return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); } } // FIXME: fold these into legal mask. if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256)) return getMOVLowToHigh(Op, dl, DAG, HasSSE2); if (isMOVHLPSMask(M, VT)) return getMOVHighToLow(Op, dl, DAG); if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); if (isMOVLPMask(M, VT)) return getMOVLP(Op, dl, DAG, HasSSE2); if (ShouldXformToMOVHLPS(M, VT) || ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) return DAG.getCommutedVectorShuffle(*SVOp); if (isShift) { // No better options. Use a vshldq / vsrldq. MVT EltVT = VT.getVectorElementType(); ShAmt *= EltVT.getSizeInBits(); return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); } bool Commuted = false; // FIXME: This should also accept a bitcast of a splat? Be careful, not // 1,1,1,1 -> v8i16 though. BitVector UndefElements; if (auto *BVOp = dyn_cast(V1.getNode())) if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) V1IsSplat = true; if (auto *BVOp = dyn_cast(V2.getNode())) if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) V2IsSplat = true; // Canonicalize the splat or undef, if present, to be on the RHS. if (!V2IsUndef && V1IsSplat && !V2IsSplat) { CommuteVectorShuffleMask(M, NumElems); std::swap(V1, V2); std::swap(V1IsSplat, V2IsSplat); Commuted = true; } if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) { // Shuffling low element of v1 into undef, just return v1. if (V2IsUndef) return V1; // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which // the instruction selector will not match, so get a canonical MOVL with // swapped operands to undo the commute. return getMOVL(DAG, dl, VT, V2, V1); } if (isUNPCKLMask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); if (isUNPCKHMask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); if (V2IsSplat) { // Normalize mask so all entries that point to V2 points to its first // element then try to match unpck{h|l} again. If match, return a // new vector_shuffle with the corrected mask.p SmallVector NewMask(M.begin(), M.end()); NormalizeMask(NewMask, NumElems); if (isUNPCKLMask(NewMask, VT, HasInt256, true)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); if (isUNPCKHMask(NewMask, VT, HasInt256, true)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); } if (Commuted) { // Commute is back and try unpck* again. // FIXME: this seems wrong. CommuteVectorShuffleMask(M, NumElems); std::swap(V1, V2); std::swap(V1IsSplat, V2IsSplat); if (isUNPCKLMask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); if (isUNPCKHMask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); } // Normalize the node to match x86 shuffle ops if needed if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true))) return DAG.getCommutedVectorShuffle(*SVOp); // The checks below are all present in isShuffleMaskLegal, but they are // inlined here right now to enable us to directly emit target specific // nodes, and remove one by one until they don't return Op anymore. if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && SVOp->getSplatIndex() == 0 && V2IsUndef) { if (VT == MVT::v2f64 || VT == MVT::v2i64) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); } if (isPSHUFHWMask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, getShufflePSHUFHWImmediate(SVOp), DAG); if (isPSHUFLWMask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, getShufflePSHUFLWImmediate(SVOp), DAG); unsigned MaskValue; if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(), &MaskValue)) return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG); if (isSHUFPMask(M, VT)) return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, getShuffleSHUFImmediate(SVOp), DAG); if (isUNPCKL_v_undef_Mask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); if (isUNPCKH_v_undef_Mask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); //===--------------------------------------------------------------------===// // Generate target specific nodes for 128 or 256-bit shuffles only // supported in the AVX instruction set. // // Handle VMOVDDUPY permutations if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256)) return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); // Handle VPERMILPS/D* permutations if (isVPERMILPMask(M, VT)) { if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, getShuffleSHUFImmediate(SVOp), DAG); return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, getShuffleSHUFImmediate(SVOp), DAG); } unsigned Idx; if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx)) return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl), Idx*(NumElems/2), DAG, dl); // Handle VPERM2F128/VPERM2I128 permutations if (isVPERM2X128Mask(M, VT, HasFp256)) return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT)) return getINSERTPS(SVOp, dl, DAG); unsigned Imm8; if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8)) return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG); if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) || VT.is512BitVector()) { MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits()); MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems); SmallVector permclMask; for (unsigned i = 0; i != NumElems; ++i) { permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT)); } SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask); if (V2IsUndef) // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 return DAG.getNode(X86ISD::VPERMV, dl, VT, DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1, DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2); } //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, // lower it into other known shuffles. FIXME: this isn't true yet, but // this is the plan. // // Handle v8i16 specifically since SSE can do byte extraction and insertion. if (VT == MVT::v8i16) { SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG); if (NewOp.getNode()) return NewOp; } if (VT == MVT::v16i16 && Subtarget->hasInt256()) { SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG); if (NewOp.getNode()) return NewOp; } if (VT == MVT::v16i8) { SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG); if (NewOp.getNode()) return NewOp; } if (VT == MVT::v32i8) { SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG); if (NewOp.getNode()) return NewOp; } // Handle all 128-bit wide vectors with 4 elements, and match them with // several different shuffle types. if (NumElems == 4 && VT.is128BitVector()) return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); // Handle general 256-bit shuffles if (VT.is256BitVector()) return LowerVECTOR_SHUFFLE_256(SVOp, DAG); return SDValue(); } // This function assumes its argument is a BUILD_VECTOR of constants or // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is // true. static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, unsigned &MaskValue) { MaskValue = 0; unsigned NumElems = BuildVector->getNumOperands(); // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. unsigned NumLanes = (NumElems - 1) / 8 + 1; unsigned NumElemsInLane = NumElems / NumLanes; // Blend for v16i16 should be symetric for the both lanes. for (unsigned i = 0; i < NumElemsInLane; ++i) { SDValue EltCond = BuildVector->getOperand(i); SDValue SndLaneEltCond = (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond; int Lane1Cond = -1, Lane2Cond = -1; if (isa(EltCond)) Lane1Cond = !isZero(EltCond); if (isa(SndLaneEltCond)) Lane2Cond = !isZero(SndLaneEltCond); if (Lane1Cond == Lane2Cond || Lane2Cond < 0) // Lane1Cond != 0, means we want the first argument. // Lane1Cond == 0, means we want the second argument. // The encoding of this argument is 0 for the first argument, 1 // for the second. Therefore, invert the condition. MaskValue |= !Lane1Cond << i; else if (Lane1Cond < 0) MaskValue |= !Lane2Cond << i; else return false; } return true; } /// \brief Try to lower a VSELECT instruction to an immediate-controlled blend /// instruction. static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElems = VT.getVectorNumElements(); // There is no blend with immediate in AVX-512. if (VT.is512BitVector()) return SDValue(); if (!Subtarget->hasSSE41() || EltVT == MVT::i8) return SDValue(); if (!Subtarget->hasInt256() && VT == MVT::v16i16) return SDValue(); if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); // Check the mask for BLEND and build the value. unsigned MaskValue = 0; if (!BUILD_VECTORtoBlendMask(cast(Cond), MaskValue)) return SDValue(); // Convert i32 vectors to floating point if it is not AVX2. // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. MVT BlendVT = VT; if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), NumElems); LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS); RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS); } SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS, DAG.getConstant(MaskValue, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, VT, Ret); } SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { // A vselect where all conditions and data are constants can be optimized into // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) && ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) && ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG); if (BlendOp.getNode()) return BlendOp; // Some types for vselect were previously set to Expand, not Legal or // Custom. Return an empty SDValue so we fall-through to Expand, after // the Custom lowering phase. MVT VT = Op.getSimpleValueType(); switch (VT.SimpleTy) { default: break; case MVT::v8i16: case MVT::v16i16: if (Subtarget->hasBWI() && Subtarget->hasVLX()) break; return SDValue(); } // We couldn't create a "Blend with immediate" node. // This node should still be legal, but we'll have to emit a blendv* // instruction. return Op; } static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } if (VT.getSizeInBits() == 16) { unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); // If Idx is 0, it's cheaper to do a move instead of a pextrw. if (Idx == 0) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)), Op.getOperand(1))); SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } if (VT == MVT::f32) { // EXTRACTPS outputs to a GPR32 register which will require a movd to copy // the result back to FR32 register. It's only worth matching if the // result has a single use which is a store or a bitcast to i32. And in // the case of a store, it's not worth it if the index is a constant 0, // because a MOVSSmr can be used instead, which is smaller and faster. if (!Op.hasOneUse()) return SDValue(); SDNode *User = *Op.getNode()->use_begin(); if ((User->getOpcode() != ISD::STORE || (isa(Op.getOperand(1)) && cast(Op.getOperand(1))->isNullValue())) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)), Op.getOperand(1)); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); } if (VT == MVT::i32 || VT == MVT::i64) { // ExtractPS/pextrq works with constant index. if (isa(Op.getOperand(1))) return Op; } return SDValue(); } /// Extract one bit from mask vector, like v16i1 or v8i1. /// AVX-512 feature. SDValue X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const { SDValue Vec = Op.getOperand(0); SDLoc dl(Vec); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); MVT EltVT = Op.getSimpleValueType(); assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, // extend vector to VR512 if (!isa(Idx)) { MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtVT.getVectorElementType(), Ext, Idx); return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); } unsigned IdxVal = cast(Idx)->getZExtValue(); const TargetRegisterClass* rc = getRegClassFor(VecVT); unsigned MaxSift = rc->getSize()*8 - 1; Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, DAG.getConstant(MaxSift - IdxVal, MVT::i8)); Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, DAG.getConstant(MaxSift, MVT::i8)); return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, DAG.getIntPtrConstant(0)); } SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); if (Op.getSimpleValueType() == MVT::i1) return ExtractBitFromMaskVector(Op, DAG); if (!isa(Idx)) { if (VecVT.is512BitVector() || (VecVT.is256BitVector() && Subtarget->hasInt256() && VecVT.getVectorElementType().getSizeInBits() == 32)) { MVT MaskEltVT = MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits()); MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() / MaskEltVT.getSizeInBits()); Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, getZeroVector(MaskVT, Subtarget, DAG, dl), Idx, DAG.getConstant(0, getPointerTy())); SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm, DAG.getConstant(0, getPointerTy())); } return SDValue(); } // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. if (VecVT.is256BitVector() || VecVT.is512BitVector()) { unsigned IdxVal = cast(Idx)->getZExtValue(); // Get the 128-bit vector. Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); MVT EltVT = VecVT.getVectorElementType(); unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); //if (IdxVal >= NumElems/2) // IdxVal -= NumElems/2; IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getConstant(IdxVal, MVT::i32)); } assert(VecVT.is128BitVector() && "Unexpected vector length"); if (Subtarget->hasSSE41()) { SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); if (Res.getNode()) return Res; } MVT VT = Op.getSimpleValueType(); // TODO: handle v16i8. if (VT.getSizeInBits() == 16) { SDValue Vec = Op.getOperand(0); unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); if (Idx == 0) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Vec), Op.getOperand(1))); // Transform it so it match pextrw which produces a 32-bit result. MVT EltVT = MVT::i32; SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } if (VT.getSizeInBits() == 32) { unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); if (Idx == 0) return Op; // SHUFPS the element to the lowest double word, then movss. int Mask[4] = { static_cast(Idx), -1, -1, -1 }; MVT VVT = Op.getOperand(0).getSimpleValueType(); SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0)); } if (VT.getSizeInBits() == 64) { // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); if (Idx == 0) return Op; // UNPCKHPD the element to the lowest double word, then movsd. // Note if the lower 64 bits of the result of the UNPCKHPD is then stored // to a f64mem, the whole operation is folded into a single MOVHPDmr. int Mask[2] = { 1, -1 }; MVT VVT = Op.getOperand(0).getSimpleValueType(); SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0)); } return SDValue(); } /// Insert one bit to mask vector, like v16i1 or v8i1. /// AVX-512 feature. SDValue X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue Elt = Op.getOperand(1); SDValue Idx = Op.getOperand(2); MVT VecVT = Vec.getSimpleValueType(); if (!isa(Idx)) { // Non constant index. Extend source and destination, // insert element and then truncate the result. MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32); SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); } unsigned IdxVal = cast(Idx)->getZExtValue(); SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); if (Vec.getOpcode() == ISD::UNDEF) return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, MVT::i8)); const TargetRegisterClass* rc = getRegClassFor(VecVT); unsigned MaxSift = rc->getSize()*8 - 1; EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, DAG.getConstant(MaxSift, MVT::i8)); EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec, DAG.getConstant(MaxSift - IdxVal, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); if (EltVT == MVT::i1) return InsertBitToMaskVector(Op, DAG); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); if (!isa(N2)) return SDValue(); auto *N2C = cast(N2); unsigned IdxVal = N2C->getZExtValue(); // If the vector is wider than 128 bits, extract the 128-bit subvector, insert // into that, and then insert the subvector back into the result. if (VT.is256BitVector() || VT.is512BitVector()) { // Get the desired 128-bit vector half. SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired half. unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128; V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getConstant(IdxIn128, MVT::i32)); // Insert the changed part back to the 256-bit vector return Insert128BitVector(N0, V, IdxVal, DAG, dl); } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); if (Subtarget->hasSSE41()) { if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) { unsigned Opc; if (VT == MVT::v8i16) { Opc = X86ISD::PINSRW; } else { assert(VT == MVT::v16i8); Opc = X86ISD::PINSRB; } // Transform it so it match pinsr{b,w} which expects a GR32 as its second // argument. if (N1.getValueType() != MVT::i32) N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(IdxVal); return DAG.getNode(Opc, dl, VT, N0, N1, N2); } if (EltVT == MVT::f32) { // Bits [7:6] of the constant are the source select. This will always be // zero here. The DAG Combiner may combine an extract_elt index into // these // bits. For example (insert (extract, 3), 2) could be matched by // putting // the '3' into bits [7:6] of X86ISD::INSERTPS. // Bits [5:4] of the constant are the destination select. This is the // value of the incoming immediate. // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. N2 = DAG.getIntPtrConstant(IdxVal << 4); // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); } if (EltVT == MVT::i32 || EltVT == MVT::i64) { // PINSR* works with constant index. return Op; } } if (EltVT == MVT::i8) return SDValue(); if (EltVT.getSizeInBits() == 16) { // Transform it so it match pinsrw which expects a 16-bit value in a GR32 // as its second argument. if (N1.getValueType() != MVT::i32) N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(IdxVal); return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); } return SDValue(); } static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT OpVT = Op.getSimpleValueType(); // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. unsigned SizeFactor = OpVT.getSizeInBits()/128; MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), OpVT.getVectorNumElements() / SizeFactor); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); // Insert the 128-bit vector. return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } if (OpVT == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); assert(OpVT.is128BitVector() && "Expected an SSE type!"); return DAG.getNode(ISD::BITCAST, dl, OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); } // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in // a simple subregister reference or explicit instructions to grab // upper bits of a vector. static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); SDValue In = Op.getOperand(0); SDValue Idx = Op.getOperand(1); unsigned IdxVal = cast(Idx)->getZExtValue(); MVT ResVT = Op.getSimpleValueType(); MVT InVT = In.getSimpleValueType(); if (Subtarget->hasFp256()) { if (ResVT.is128BitVector() && (InVT.is256BitVector() || InVT.is512BitVector()) && isa(Idx)) { return Extract128BitVector(In, IdxVal, DAG, dl); } if (ResVT.is256BitVector() && InVT.is512BitVector() && isa(Idx)) { return Extract256BitVector(In, IdxVal, DAG, dl); } } return SDValue(); } // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a // simple superregister reference or explicit instructions to insert // the upper bits of a vector. static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { if (Subtarget->hasFp256()) { SDLoc dl(Op.getNode()); SDValue Vec = Op.getNode()->getOperand(0); SDValue SubVec = Op.getNode()->getOperand(1); SDValue Idx = Op.getNode()->getOperand(2); if ((Op.getNode()->getSimpleValueType(0).is256BitVector() || Op.getNode()->getSimpleValueType(0).is512BitVector()) && SubVec.getNode()->getSimpleValueType(0).is128BitVector() && isa(Idx)) { unsigned IdxVal = cast(Idx)->getZExtValue(); return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); } if (Op.getNode()->getSimpleValueType(0).is512BitVector() && SubVec.getNode()->getSimpleValueType(0).is256BitVector() && isa(Idx)) { unsigned IdxVal = cast(Idx)->getZExtValue(); return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); } } return SDValue(); } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOV32ri. SDValue X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = 0; unsigned WrapperKind = X86ISD::Wrapper; CodeModel::Model M = DAG.getTarget().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) WrapperKind = X86ISD::WrapperRIP; else if (Subtarget->isPICStyleGOT()) OpFlag = X86II::MO_GOTOFF; else if (Subtarget->isPICStyleStubPIC()) OpFlag = X86II::MO_PIC_BASE_OFFSET; SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), CP->getAlignment(), CP->getOffset(), OpFlag); SDLoc DL(CP); Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. if (OpFlag) { Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()), Result); } return Result; } SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { JumpTableSDNode *JT = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = 0; unsigned WrapperKind = X86ISD::Wrapper; CodeModel::Model M = DAG.getTarget().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) WrapperKind = X86ISD::WrapperRIP; else if (Subtarget->isPICStyleGOT()) OpFlag = X86II::MO_GOTOFF; else if (Subtarget->isPICStyleStubPIC()) OpFlag = X86II::MO_PIC_BASE_OFFSET; SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), OpFlag); SDLoc DL(JT); Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. if (OpFlag) Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()), Result); return Result; } SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { const char *Sym = cast(Op)->getSymbol(); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = 0; unsigned WrapperKind = X86ISD::Wrapper; CodeModel::Model M = DAG.getTarget().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) { if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) OpFlag = X86II::MO_GOTPCREL; WrapperKind = X86ISD::WrapperRIP; } else if (Subtarget->isPICStyleGOT()) { OpFlag = X86II::MO_GOT; } else if (Subtarget->isPICStyleStubPIC()) { OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; } else if (Subtarget->isPICStyleStubNoDynamic()) { OpFlag = X86II::MO_DARWIN_NONLAZY; } SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); SDLoc DL(Op); Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ && !Subtarget->is64Bit()) { Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()), Result); } // For symbols that require a load from a stub to get the address, emit the // load. if (isGlobalStubReference(OpFlag)) Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(), false, false, false, 0); return Result; } SDValue X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // Create the TargetBlockAddressAddress node. unsigned char OpFlags = Subtarget->ClassifyBlockAddressReference(); CodeModel::Model M = DAG.getTarget().getCodeModel(); const BlockAddress *BA = cast(Op)->getBlockAddress(); int64_t Offset = cast(Op)->getOffset(); SDLoc dl(Op); SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset, OpFlags); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); else Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), Result); } return Result; } SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, int64_t Offset, SelectionDAG &DAG) const { // Create the TargetGlobalAddress node, folding in the constant // offset if it is legal. unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()); CodeModel::Model M = DAG.getTarget().getCodeModel(); SDValue Result; if (OpFlags == X86II::MO_NO_FLAG && X86::isOffsetSuitableForCodeModel(Offset, M)) { // A direct static reference to a global. Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); Offset = 0; } else { Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); } if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); else Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), Result); } // For globals that require a load from a stub to get the address, emit the // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(), false, false, false, 0); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. if (Offset != 0) Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, DAG.getConstant(Offset, getPointerTy())); return Result; } SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = cast(Op)->getGlobal(); int64_t Offset = cast(Op)->getOffset(); return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); } static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LocalDynamic = false) { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDLoc dl(GA); SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR : X86ISD::TLSADDR; if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } else { SDValue Ops[] = { Chain, TGA }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. MFI->setAdjustsStack(true); MFI->setHasCalls(true); SDValue Flag = Chain.getValue(1); return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { SDValue InFlag; SDLoc dl(GA); // ? function entry point might be better SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSGD); } static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool is64Bit) { SDLoc dl(GA); // Get the start address of the TLS block for this module. X86MachineFunctionInfo* MFI = DAG.getMachineFunction() .getInfo(); MFI->incNumLocalDynamicTLSAccesses(); SDValue Base; if (is64Bit) { Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSLD, /*LocalDynamic=*/true); } else { SDValue InFlag; SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSLDM, /*LocalDynamic=*/true); } // Note: the CleanupLocalDynamicTLSPass will remove redundant computations // of Base. // Build x@dtpoff. unsigned char OperandFlags = X86II::MO_DTPOFF; unsigned WrapperKind = X86ISD::Wrapper; SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); // Add x@dtpoff with the base. return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); } // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC) { SDLoc dl(GA); // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), is64Bit ? 257 : 256)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0), MachinePointerInfo(Ptr), false, false, false, 0); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. One exception is // initialexec. unsigned WrapperKind = X86ISD::Wrapper; if (model == TLSModel::LocalExec) { OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; } else if (model == TLSModel::InitialExec) { if (is64Bit) { OperandFlags = X86II::MO_GOTTPOFF; WrapperKind = X86ISD::WrapperRIP; } else { OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; } } else { llvm_unreachable("Unexpected model"); } // emit "addl x@ntpoff,%eax" (local exec) // or "addl x@indntpoff,%eax" (initial exec) // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); if (model == TLSModel::InitialExec) { if (isPIC && !is64Bit) { Offset = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, MachinePointerInfo::getGOT(), false, false, false, 0); } // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); } SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast(Op); const GlobalValue *GV = GA->getGlobal(); if (Subtarget->isTargetELF()) { TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: if (Subtarget->is64Bit()) return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); case TLSModel::LocalDynamic: return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(), Subtarget->is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModel( GA, DAG, getPointerTy(), model, Subtarget->is64Bit(), DAG.getTarget().getRelocationModel() == Reloc::PIC_); } llvm_unreachable("Unknown TLS model."); } if (Subtarget->isTargetDarwin()) { // Darwin only has one model of TLS. Lower to that. unsigned char OpFlag = 0; unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? X86ISD::WrapperRIP : X86ISD::Wrapper; // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) && !Subtarget->is64Bit(); if (PIC32) OpFlag = X86II::MO_TLVP_PIC_BASE; else OpFlag = X86II::MO_TLVP; SDLoc DL(Op); SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag); SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); // With PIC32, the address is actually $g + Offset. if (PIC32) Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()), Offset); // Lowering the machine isd will make sure everything is in the right // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setAdjustsStack(true); // And our return value (tls address) is in the standard call return value // location. unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), Chain.getValue(1)); } if (Subtarget->isTargetKnownWindowsMSVC() || Subtarget->isTargetWindowsGNU()) { // Just use the implicit TLS architecture // Need to generate someting similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage // ; from TEB // mov ecx, dword [rel _tls_index]: Load index (from C runtime) // mov rcx, qword [rdx+rcx*8] // mov eax, .tls$:tlsvar // [rax+rcx] contains the address // Windows 64bit: gs:0x58 // Windows 32bit: fs:__tls_array SDLoc dl(GA); SDValue Chain = DAG.getEntryNode(); // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly // use its literal value of 0x2C. Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() ? Type::getInt8PtrTy(*DAG.getContext(), 256) : Type::getInt32PtrTy(*DAG.getContext(), 257)); SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) : (Subtarget->isTargetWindowsGNU() ? DAG.getIntPtrConstant(0x2C) : DAG.getExternalSymbol("_tls_array", getPointerTy())); SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray, MachinePointerInfo(Ptr), false, false, false, 0); // Load the _tls_index variable SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy()); if (Subtarget->is64Bit()) IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, IDX, MachinePointerInfo(), MVT::i32, false, false, false, 0); else IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), false, false, false, 0); SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), getPointerTy()); IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(), false, false, false, 0); // Get the offset of start of .tls section SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), X86II::MO_SECREL); SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset); } llvm_unreachable("TLS not implemented for this target."); } /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values /// and take a 2 x i32 value to shift plus a shift amount. static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); MVT VT = Op.getSimpleValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away // during isel. SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits - 1, MVT::i8)); SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, MVT::i8)) : DAG.getConstant(0, VT); SDValue Tmp2, Tmp3; if (Op.getOpcode() == ISD::SHL_PARTS) { Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); } else { Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); } // If the shift amount is larger or equal than the width of a part we can't // rely on the results of shld/shrd. Insert a test and select the appropriate // values for large shift amounts. SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits, MVT::i8)); SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, AndNode, DAG.getConstant(0, MVT::i8)); SDValue Hi, Lo; SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; if (Op.getOpcode() == ISD::SHL_PARTS) { Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } else { Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { MVT SrcVT = Op.getOperand(0).getSimpleValueType(); SDLoc dl(Op); if (SrcVT.isVector()) { if (SrcVT.getVectorElementType() == MVT::i1) { MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Op.getOperand(0))); } return SDValue(); } assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); // These are really Legal; return the operand so the caller accepts it as // Legal. if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) return Op; if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && Subtarget->is64Bit()) { return Op; } unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, MachinePointerInfo::getFixedStack(SSFI), false, false, 0); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); } SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const { // Build the FILD SDLoc DL(Op); SDVTList Tys; bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); if (useSSE) Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); else Tys = DAG.getVTList(Op.getValueType(), MVT::Other); unsigned ByteSize = SrcVT.getSizeInBits()/8; FrameIndexSDNode *FI = dyn_cast(StackSlot); MachineMemOperand *MMO; if (FI) { int SSFI = FI->getIndex(); MMO = DAG.getMachineFunction() .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { MMO = cast(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); } SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, Tys, Ops, SrcVT, MMO); if (useSSE) { Chain = Result.getValue(1); SDValue InFlag = Result.getValue(2); // FIXME: Currently the FST is flagged to the FILD_FLAG. This // shouldn't be necessary except that RFP cannot be live across // multiple blocks. When stackifier is fixed, they can be uncoupled. MachineFunction &MF = DAG.getMachineFunction(); unsigned SSFISize = Op.getValueType().getSizeInBits()/8; int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); Tys = DAG.getVTList(MVT::Other); SDValue Ops[] = { Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag }; MachineMemOperand *MMO = DAG.getMachineFunction() .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, Ops, Op.getValueType(), MMO); Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(SSFI), false, false, false, 0); } return Result; } // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const { // This algorithm is not obvious. Here it is what we're trying to output: /* movq %rax, %xmm0 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } #ifdef __SSE3__ haddpd %xmm0, %xmm0 #else pshufd $0x4e, %xmm0, %xmm1 addpd %xmm1, %xmm0 #endif */ SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); // Build some magic constants. static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; Constant *C0 = ConstantDataVector::get(*Context, CV0); SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); SmallVector CV1; CV1.push_back( ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, APInt(64, 0x4330000000000000ULL)))); CV1.push_back( ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); // Load the 64-bit value into an XMM register. SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(), false, false, false, 16); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1), CLod0); SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(), false, false, false, 16); SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1); SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; if (Subtarget->hasSSE3()) { // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub); SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, S2F, 0x4E, DAG); Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle), Sub); } return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, DAG.getIntPtrConstant(0)); } // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), MVT::f64); // Load the 32-bit value into an XMM register. SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(0)); // Zero out the upper parts of the register. Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), DAG.getIntPtrConstant(0)); // Or the load with the bias. SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), DAG.getIntPtrConstant(0)); // Subtract the bias. SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); // Handle final rounding. EVT DestVT = Op.getValueType(); if (DestVT.bitsLT(MVT::f64)) return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, DAG.getIntPtrConstant(0)); if (DestVT.bitsGT(MVT::f64)) return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); // Handle final rounding. return Sub; } static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // The algorithm is the following: // #ifdef __SSE4_1__ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); // #else // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; // uint4 hi = (v >> 16) | (uint4) 0x53000000; // #endif // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // return (float4) lo + fhi; SDLoc DL(Op); SDValue V = Op->getOperand(0); EVT VecIntVT = V.getValueType(); bool Is128 = VecIntVT == MVT::v4i32; EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; // If we convert to something else than the supported type, e.g., to v4f64, // abort early. if (VecFloatVT != Op->getValueType(0)) return SDValue(); unsigned NumElts = VecIntVT.getVectorNumElements(); assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && "Unsupported custom type"); assert(NumElts <= 8 && "The size of the constant array must be fixed"); // In the #idef/#else code, we have in common: // - The vector of constants: // -- 0x4b000000 // -- 0x53000000 // - A shift: // -- v >> 16 // Create the splat vector for 0x4b000000. SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32); SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow, CstLow, CstLow, CstLow, CstLow}; SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, makeArrayRef(&CstLowArray[0], NumElts)); // Create the splat vector for 0x53000000. SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32); SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh, CstHigh, CstHigh, CstHigh, CstHigh}; SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, makeArrayRef(&CstHighArray[0], NumElts)); // Create the right shift. SDValue CstShift = DAG.getConstant(16, MVT::i32); SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift, CstShift, CstShift, CstShift, CstShift}; SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, makeArrayRef(&CstShiftArray[0], NumElts)); SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); SDValue Low, High; if (Subtarget.hasSSE41()) { EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); SDValue VecCstLowBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow); SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V); // Low will be bitcasted right away, so do not bother bitcasting back to its // original type. Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32)); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); SDValue VecCstHighBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh); SDValue VecShiftBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift); // High will be bitcasted right away, so do not bother bitcasting back to // its original type. High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32)); } else { SDValue CstMask = DAG.getConstant(0xffff, MVT::i32); SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask, CstMask, CstMask, CstMask); // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); // uint4 hi = (v >> 16) | (uint4) 0x53000000; High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); } // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). SDValue CstFAdd = DAG.getConstantFP( APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32); SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd, CstFAdd, CstFAdd, CstFAdd, CstFAdd}; SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT, makeArrayRef(&CstFAddArray[0], NumElts)); // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High); SDValue FHigh = DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); // return (float4) lo + fhi; SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low); return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); } SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); MVT SVT = N0.getSimpleValueType(); SDLoc dl(Op); switch (SVT.SimpleTy) { default: llvm_unreachable("Custom UINT_TO_FP is not supported!"); case MVT::v4i8: case MVT::v4i16: case MVT::v8i8: case MVT::v8i16: { MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); } case MVT::v4i32: case MVT::v8i32: return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); } llvm_unreachable(nullptr); } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); if (Op.getValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG); // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform // the optimization here. if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG); if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG); if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); if (SrcVT == MVT::i32) { SDValue WordOff = DAG.getConstant(4, getPointerTy()); SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackSlot, WordOff); SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, MachinePointerInfo(), false, false, 0); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), OffsetSlot, MachinePointerInfo(), false, false, 0); SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); return Fild; } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, MachinePointerInfo(), false, false, 0); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) int SSFI = cast(StackSlot)->getIndex(); MachineMemOperand *MMO = DAG.getMachineFunction() .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MMO); APInt FF(32, 0x5F800000ULL); // Check whether the sign bit is set. SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(*DAG.getContext(), MVT::i64), Op.getOperand(0), DAG.getConstant(0, MVT::i64), ISD::SETLT); // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. SDValue FudgePtr = DAG.getConstantPool( ConstantInt::get(*DAG.getContext(), FF.zext(64)), getPointerTy()); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0); SDValue Four = DAG.getIntPtrConstant(4); SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, Zero, Four); FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, MachinePointerInfo::getConstantPool(), MVT::f32, false, false, false, 4); // Extend everything to 80 bits to force it to be done on x87. SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); } std::pair X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { SDLoc DL(Op); EVT DstTy = Op.getValueType(); if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && "Unknown FP_TO_INT to lower!"); // These are really Legal. if (DstTy == MVT::i32 && isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); if (Subtarget->is64Bit() && DstTy == MVT::i64 && isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); // We lower FP->int64 either into FISTP64 followed by a load from a temporary // stack slot, or into the FTOL runtime function. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); unsigned Opc; if (!IsSigned && isIntegerTypeFTOL(DstTy)) Opc = X86ISD::WIN_FTOL; else switch (DstTy.getSimpleVT().SimpleTy) { default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; } SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); EVT TheVT = Op.getOperand(0).getValueType(); // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it is on the callstack. if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, MachinePointerInfo::getFixedStack(SSFI), false, false, 0); SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(TheVT) }; MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOLoad, MemSize, MemSize); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); } MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOStore, MemSize, MemSize); if (Opc != X86ISD::WIN_FTOL) { // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO); return std::make_pair(FIST, StackSlot); } else { SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, Value); SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, MVT::i32, ftol.getValue(1)); SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, MVT::i32, eax.getValue(2)); SDValue Ops[] = { eax, edx }; SDValue pair = IsReplace ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops) : DAG.getMergeValues(Ops, DL); return std::make_pair(pair, SDValue()); } } static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); // Optimize vectors in AVX mode: // // v8i16 -> v8i32 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. // Concat upper and lower parts. // // v4i32 -> v4i64 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. // Concat upper and lower parts. // if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) && ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) return SDValue(); if (Subtarget->hasInt256()) return DAG.getNode(X86ISD::VZEXT, dl, VT, In); SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); SDValue Undef = DAG.getUNDEF(InVT); bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements()/2); OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc DL(Op); unsigned int NumElts = VT.getVectorNumElements(); if (NumElts != 8 && NumElts != 16) return SDValue(); if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) return DAG.getNode(X86ISD::VZEXT, DL, VT, In); EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Now we have only mask extension assert(InVT.getVectorElementType() == MVT::i1); SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType()); const Constant *C = (dyn_cast(Cst))->getConstantIntValue(); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); unsigned Alignment = cast(CP)->getAlignment(); SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(), false, false, false, Alignment); SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld); if (VT.is512BitVector()) return Brcst; return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst); } static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { if (Subtarget->hasFp256()) { SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); if (Res.getNode()) return Res; } return SDValue(); } static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) return LowerZERO_EXTEND_AVX512(Op, DAG); if (Subtarget->hasFp256()) { SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); if (Res.getNode()) return Res; } assert(!VT.is256BitVector() || !SVT.is128BitVector() || VT.getVectorNumElements() != SVT.getVectorNumElements()); return SDValue(); } SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); if (VT == MVT::i1) { assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) && "Invalid scalar TRUNCATE operation"); if (InVT.getSizeInBits() >= 32) return SDValue(); In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); return DAG.getNode(ISD::TRUNCATE, DL, VT, In); } assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { if (VT.getVectorElementType().getSizeInBits() >=8) return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); unsigned NumElts = InVT.getVectorNumElements(); assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); if (InVT.getSizeInBits() < 512) { MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64; In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; } SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType()); const Constant *C = (dyn_cast(Cst))->getConstantIntValue(); SDValue CP = DAG.getConstantPool(C, getPointerTy()); unsigned Alignment = cast(CP)->getAlignment(); SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(), false, false, false, Alignment); SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld); SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); } if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget->hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In); In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, DAG.getIntPtrConstant(0)); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(2)); OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); static const int ShufMask[] = {0, 2, 4, 6}; return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); } if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { // On AVX2, v8i32 -> v8i16 becomed PSHUFB. if (Subtarget->hasInt256()) { In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In); SmallVector pshufbMask; for (unsigned i = 0; i < 2; ++i) { pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8)); pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8)); pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8)); pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8)); pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8)); pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8)); pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8)); pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8)); for (unsigned j = 0; j < 8; ++j) pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); } SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask); In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In); static const int ShufMask[] = {0, 2, -1, -1}; In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), &ShufMask[0]); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0)); return DAG.getNode(ISD::BITCAST, DL, VT, In); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(0)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(4)); OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo); OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi); // The PSHUFB mask: static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1}; SDValue Undef = DAG.getUNDEF(MVT::v16i8); OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); // The MOVLHPS Mask: static const int ShufMask2[] = {0, 1, 4, 5}; SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res); } // Handle truncation of V256 to V128 using shuffles. if (!VT.is128BitVector() || !InVT.is256BitVector()) return SDValue(); assert(Subtarget->hasFp256() && "256-bit vector without AVX!"); unsigned NumElems = VT.getVectorNumElements(); MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); SmallVector MaskVec(NumElems * 2, -1); // Prepare truncation shuffle mask for (unsigned i = 0; i != NumElems; ++i) MaskVec[i] = i * 2; SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getNode(ISD::BITCAST, DL, NVT, In), DAG.getUNDEF(NVT), &MaskVec[0]); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, DAG.getIntPtrConstant(0)); } SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { assert(!Op.getSimpleValueType().isVector()); std::pair Vals = FP_TO_INTHelper(Op, DAG, /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. if (!FIST.getNode()) return Op; if (StackSlot.getNode()) // Load the result. return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); // The node is the result. return FIST; } SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { std::pair Vals = FP_TO_INTHelper(Op, DAG, /*IsSigned=*/ false, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; assert(FIST.getNode() && "Unexpected failure"); if (StackSlot.getNode()) // Load the result. return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); // The node is the result. return FIST; } static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); return DAG.getNode(X86ISD::VFPEXT, DL, VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT))); } /// The only differences between FABS and FNEG are the mask and the logic op. /// FNEG also has a folding opportunity for FNEG(FABS(x)). static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && "Wrong opcode for lowering FABS or FNEG."); bool IsFABS = (Op.getOpcode() == ISD::FABS); // If this is a FABS and it has an FNEG user, bail out to fold the combination // into an FNABS. We'll lower the FABS after that if it is still in use. if (IsFABS) for (SDNode *User : Op->uses()) if (User->getOpcode() == ISD::FNEG) return Op; SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); // Assume scalar op for initialization; update for vector if needed. // Note that there are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. // Using a 16-byte mask allows folding the load of the mask with // the logic op, so it can save (~4 bytes) on code size. MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 2 : 4; // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. if (VT.isVector()) { EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); } unsigned EltBits = EltVT.getSizeInBits(); LLVMContext *Context = DAG.getContext(); // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits); Constant *C = ConstantInt::get(*Context, MaskElt); C = ConstantVector::getSplat(NumElts, C); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy()); unsigned Alignment = cast(CPIdx)->getAlignment(); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, Alignment); if (VT.isVector()) { // For a vector, cast operands to a vector type, perform the logic op, // and cast the result back to the original value type. MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask); SDValue Operand = IsFNABS ? DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) : DAG.getNode(ISD::BITCAST, dl, VecVT, Op0); unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR; return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted)); } // If not vector, then scalar. unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; return DAG.getNode(BitOp, dl, VT, Operand, Mask); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); LLVMContext *Context = DAG.getContext(); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT SrcVT = Op1.getSimpleValueType(); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); SrcVT = VT; } // And if it is bigger, shrink it first. if (SrcVT.bitsGT(VT)) { Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); SrcVT = VT; } // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. const fltSemantics &Sem = VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle; const unsigned SizeInBits = VT.getSizeInBits(); SmallVector CV( VT == MVT::f64 ? 2 : 4, ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); // First, clear all bits but the sign bit from the second operand (sign). CV[0] = ConstantFP::get(*Context, APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1))); Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, 16); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); // Next, clear the sign bit from the first operand (magnitude). // If it's a constant, we can clear it here. if (ConstantFPSDNode *Op0CN = dyn_cast(Op0)) { APFloat APF = Op0CN->getValueAPF(); // If the magnitude is a positive zero, the sign bit alone is enough. if (APF.isPosZero()) return SignBit; APF.clearSign(); CV[0] = ConstantFP::get(*Context, APF); } else { CV[0] = ConstantFP::get( *Context, APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1))); } C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, 16); // If the magnitude operand wasn't a constant, we need to AND out the sign. if (!isa(Op0)) Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val); // OR the magnitude value with the sign bit. return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, DAG.getConstant(1, VT)); return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); } // Check whether an OR'd tree is PTEST-able. static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); if (!Subtarget->hasSSE41()) return SDValue(); if (!Op->hasOneUse()) return SDValue(); SDNode *N = Op.getNode(); SDLoc DL(N); SmallVector Opnds; DenseMap VecInMap; SmallVector VecIns; EVT VT = MVT::Other; // Recognize a special case where a vector is casted into wide integer to // test all 0s. Opnds.push_back(N->getOperand(0)); Opnds.push_back(N->getOperand(1)); for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { SmallVectorImpl::const_iterator I = Opnds.begin() + Slot; // BFS traverse all OR'd operands. if (I->getOpcode() == ISD::OR) { Opnds.push_back(I->getOperand(0)); Opnds.push_back(I->getOperand(1)); // Re-evaluate the number of nodes to be traversed. e += 2; // 2 more nodes (LHS and RHS) are pushed. continue; } // Quit if a non-EXTRACT_VECTOR_ELT if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // Quit if without a constant index. SDValue Idx = I->getOperand(1); if (!isa(Idx)) return SDValue(); SDValue ExtractedFromVec = I->getOperand(0); DenseMap::iterator M = VecInMap.find(ExtractedFromVec); if (M == VecInMap.end()) { VT = ExtractedFromVec.getValueType(); // Quit if not 128/256-bit vector. if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); // Quit if not the same type. if (VecInMap.begin() != VecInMap.end() && VT != VecInMap.begin()->first.getValueType()) return SDValue(); M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; VecIns.push_back(ExtractedFromVec); } M->second |= 1U << cast(Idx)->getZExtValue(); } assert((VT.is128BitVector() || VT.is256BitVector()) && "Not extracted from 128-/256-bit vector."); unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; for (DenseMap::const_iterator I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { // Quit if not all elements are used. if (I->second != FullMask) return SDValue(); } EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; // Cast all vectors into TestVT for PTEST. for (unsigned i = 0, e = VecIns.size(); i < e; ++i) VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]); // If more than one full vectors are evaluated, OR them first before PTEST. for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { // Each iteration will OR 2 nodes and append the result until there is only // 1 node left, i.e. the final OR'd value of all vectors. SDValue LHS = VecIns[Slot]; SDValue RHS = VecIns[Slot + 1]; VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } /// \brief return true if \c Op has a use that doesn't just read flags. static bool hasNonFlagsUse(SDValue Op) { for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; ++UI) { SDNode *User = *UI; unsigned UOpNo = UI.getOperandNo(); if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { // Look pass truncate. UOpNo = User->use_begin().getOperandNo(); User = *User->use_begin(); } if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) return true; } return false; } /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::i1) // KORTEST instruction should be selected return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, Op.getValueType())); // CF and OF aren't always set the way we want. Determine which // of these we need. bool NeedCF = false; bool NeedOF = false; switch (X86CC) { default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: NeedCF = true; break; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: case X86::COND_O: case X86::COND_NO: { // Check if we really need to set the // Overflow flag. If NoSignedWrap is present // that is not actually needed. switch (Op->getOpcode()) { case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::SHL: { const BinaryWithFlagsSDNode *BinNode = cast(Op.getNode()); if (BinNode->hasNoSignedWrap()) break; } default: NeedOF = true; break; } break; } } // See if we can use the EFLAGS value from the operand instead of // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. if (Op.getResNo() != 0 || NeedOF || NeedCF) { // Emit a CMP with 0, which is the TEST pattern. //if (Op.getValueType() == MVT::i1) // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op, // DAG.getConstant(0, MVT::i1)); return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, Op.getValueType())); } unsigned Opcode = 0; unsigned NumOperands = 0; // Truncate operations may prevent the merge of the SETCC instruction // and the arithmetic instruction before it. Attempt to truncate the operands // of the arithmetic instruction and use a reduced bit-width instruction. bool NeedTruncation = false; SDValue ArithOp = Op; if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) { SDValue Arith = Op->getOperand(0); // Both the trunc and the arithmetic op need to have one user each. if (Arith->hasOneUse()) switch (Arith.getOpcode()) { default: break; case ISD::ADD: case ISD::SUB: case ISD::AND: case ISD::OR: case ISD::XOR: { NeedTruncation = true; ArithOp = Arith; } } } // NOTICE: In the code below we use ArithOp to hold the arithmetic operation // which may be the result of a CAST. We use the variable 'Op', which is the // non-casted variable when we check for possible users. switch (ArithOp.getOpcode()) { case ISD::ADD: // Due to an isel shortcoming, be conservative if this add is likely to be // selected as part of a load-modify-store instruction. When the root node // in a match is a store, isel doesn't know how to remap non-chain non-flag // uses of other nodes in the match, such as the ADD in this case. This // leads to the ADD being left around and reselected, with the result being // two adds in the output. Alas, even if none our users are stores, that // doesn't prove we're O.K. Ergo, if we have any parents that aren't // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require // climbing the DAG back to the root, and it doesn't seem to be worth the // effort. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC && UI->getOpcode() != ISD::STORE) goto default_case; if (ConstantSDNode *C = dyn_cast(ArithOp.getNode()->getOperand(1))) { // An add of one will be selected as an INC. if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) { Opcode = X86ISD::INC; NumOperands = 1; break; } // An add of negative one (subtract of one) will be selected as a DEC. if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) { Opcode = X86ISD::DEC; NumOperands = 1; break; } } // Otherwise use a regular EFLAGS-setting add. Opcode = X86ISD::ADD; NumOperands = 2; break; case ISD::SHL: case ISD::SRL: // If we have a constant logical shift that's only used in a comparison // against zero turn it into an equivalent AND. This allows turning it into // a TEST instruction later. if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() && isa(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); unsigned ShAmt = Op->getConstantOperandVal(1); if (ShAmt >= BitWidth) // Avoid undefined shifts. break; APInt Mask = ArithOp.getOpcode() == ISD::SRL ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); if (!Mask.isSignedIntN(32)) // Avoid large immediates. break; SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), DAG.getConstant(Mask, VT)); DAG.ReplaceAllUsesWith(Op, New); Op = New; } break; case ISD::AND: // If the primary and result isn't used, don't bother using X86ISD::AND, // because a TEST instruction will be better. if (!hasNonFlagsUse(Op)) break; // FALL THROUGH case ISD::SUB: case ISD::OR: case ISD::XOR: // Due to the ISEL shortcoming noted above, be conservative if this op is // likely to be selected as part of a load-modify-store instruction. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) if (UI->getOpcode() == ISD::STORE) goto default_case; // Otherwise use a regular EFLAGS-setting instruction. switch (ArithOp.getOpcode()) { default: llvm_unreachable("unexpected operator!"); case ISD::SUB: Opcode = X86ISD::SUB; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: { if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG); if (EFLAGS.getNode()) return EFLAGS; } Opcode = X86ISD::OR; break; } } NumOperands = 2; break; case X86ISD::ADD: case X86ISD::SUB: case X86ISD::INC: case X86ISD::DEC: case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: return SDValue(Op.getNode(), 1); default: default_case: break; } // If we found that truncation is beneficial, perform the truncation and // update 'Op'. if (NeedTruncation) { EVT VT = Op.getValueType(); SDValue WideVal = Op->getOperand(0); EVT WideVT = WideVal.getValueType(); unsigned ConvertedOp = 0; // Use a target machine opcode to prevent further DAGCombine // optimizations that may separate the arithmetic operations // from the setcc node. switch (WideVal.getOpcode()) { default: break; case ISD::ADD: ConvertedOp = X86ISD::ADD; break; case ISD::SUB: ConvertedOp = X86ISD::SUB; break; case ISD::AND: ConvertedOp = X86ISD::AND; break; case ISD::OR: ConvertedOp = X86ISD::OR; break; case ISD::XOR: ConvertedOp = X86ISD::XOR; break; } if (ConvertedOp) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); } } } if (Opcode == 0) // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, Op.getValueType())); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SmallVector Ops; for (unsigned i = 0; i != NumOperands; ++i) Ops.push_back(Op.getOperand(i)); SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesWith(Op, New); return SDValue(New.getNode(), 1); } /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { if (ConstantSDNode *C = dyn_cast(Op1)) { if (C->getAPIntValue() == 0) return EmitTest(Op0, X86CC, dl, DAG); if (Op0.getValueType() == MVT::i1) llvm_unreachable("Unexpected comparison operation for MVT::i1 operands"); } if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { // Do the comparison at i32 if it's smaller, besides the Atom case. // This avoids subregister aliasing issues. Keep the smaller reference // if we're optimizing for size, however, as that'll allow better folding // of memory operations. if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( AttributeSet::FunctionIndex, Attribute::MinSize) && !Subtarget->isAtom()) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); } // Use SUB instead of CMP to enable CSE between SUB and CMP. SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); return SDValue(Sub.getNode(), 1); } return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); } /// Convert a comparison if required by the subtarget. SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const { // If the subtarget does not support the FUCOMI instruction, floating-point // comparisons have to be converted. if (Subtarget->hasCMov() || Cmp.getOpcode() != X86ISD::CMP || !Cmp.getOperand(0).getValueType().isFloatingPoint() || !Cmp.getOperand(1).getValueType().isFloatingPoint()) return Cmp; // The instruction selector will select an FUCOM instruction instead of // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence // build an SDNode sequence that transfers the result from FPSW into EFLAGS: // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) SDLoc dl(Cmp); SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, DAGCombinerInfo &DCI, unsigned &RefinementSteps, bool &UseOneConstNR) const { // FIXME: We should use instruction latency models to calculate the cost of // each potential sequence, but this is very hard to do reliably because // at least Intel's Core* chips have variable timing based on the number of // significant digits in the divisor and/or sqrt operand. if (!Subtarget->useSqrtEst()) return SDValue(); EVT VT = Op.getValueType(); // SSE1 has rsqrtss and rsqrtps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 // instructions: convert to single, rsqrtss, convert back to double, refine // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA // along with FMA, this could be a throughput win. if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || (Subtarget->hasAVX() && VT == MVT::v8f32)) { RefinementSteps = 1; UseOneConstNR = false; return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); } return SDValue(); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getRecipEstimate(SDValue Op, DAGCombinerInfo &DCI, unsigned &RefinementSteps) const { // FIXME: We should use instruction latency models to calculate the cost of // each potential sequence, but this is very hard to do reliably because // at least Intel's Core* chips have variable timing based on the number of // significant digits in the divisor. if (!Subtarget->useReciprocalEst()) return SDValue(); EVT VT = Op.getValueType(); // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision // reciprocal estimate with refinement on x86 prior to FMA requires // 15 instructions: convert to single, rcpss, convert back to double, refine // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA // along with FMA, this could be a throughput win. if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || (Subtarget->hasAVX() && VT == MVT::v8f32)) { RefinementSteps = ReciprocalEstimateRefinementSteps; return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); } return SDValue(); } static bool isAllOnes(SDValue V) { ConstantSDNode *C = dyn_cast(V); return C && C->isAllOnesValue(); } /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node /// if it's possible. SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, SDLoc dl, SelectionDAG &DAG) const { SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); if (Op0.getOpcode() == ISD::TRUNCATE) Op0 = Op0.getOperand(0); if (Op1.getOpcode() == ISD::TRUNCATE) Op1 = Op1.getOperand(0); SDValue LHS, RHS; if (Op1.getOpcode() == ISD::SHL) std::swap(Op0, Op1); if (Op0.getOpcode() == ISD::SHL) { if (ConstantSDNode *And00C = dyn_cast(Op0.getOperand(0))) if (And00C->getZExtValue() == 1) { // If we looked past a truncate, check that it's only truncating away // known zeros. unsigned BitWidth = Op0.getValueSizeInBits(); unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { APInt Zeros, Ones; DAG.computeKnownBits(Op0, Zeros, Ones); if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) return SDValue(); } LHS = Op1; RHS = Op0.getOperand(1); } } else if (Op1.getOpcode() == ISD::Constant) { ConstantSDNode *AndRHS = cast(Op1); uint64_t AndRHSVal = AndRHS->getZExtValue(); SDValue AndLHS = Op0; if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { LHS = AndLHS.getOperand(0); RHS = AndLHS.getOperand(1); } // Use BT if the immediate can't be encoded in a TEST instruction. if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { LHS = AndLHS; RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType()); } } if (LHS.getNode()) { // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT // instruction. Since the shift amount is in-range-or-undefined, we know // that doing a bittest on the i32 value is ok. We extend to i32 because // the encoding for the i16 version is larger than the i32 version. // Also promote i16 to i32 for performance / code size reason. if (LHS.getValueType() == MVT::i8 || LHS.getValueType() == MVT::i16) LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); // If the operand types disagree, extend the shift amount to match. Since // BT ignores high bits (like shifts) we can use anyextend. if (LHS.getValueType() != RHS.getValueType()) RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(Cond, MVT::i8), BT); } return SDValue(); } /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point /// mask CMPs. static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1) { unsigned SSECC; bool Swap = false; // SSE Condition code mapping: // 0 - EQ // 1 - LT // 2 - LE // 3 - UNORD // 4 - NEQ // 5 - NLT // 6 - NLE // 7 - ORD switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETOEQ: case ISD::SETEQ: SSECC = 0; break; case ISD::SETOGT: case ISD::SETGT: Swap = true; // Fallthrough case ISD::SETLT: case ISD::SETOLT: SSECC = 1; break; case ISD::SETOGE: case ISD::SETGE: Swap = true; // Fallthrough case ISD::SETLE: case ISD::SETOLE: SSECC = 2; break; case ISD::SETUO: SSECC = 3; break; case ISD::SETUNE: case ISD::SETNE: SSECC = 4; break; case ISD::SETULE: Swap = true; // Fallthrough case ISD::SETUGE: SSECC = 5; break; case ISD::SETULT: Swap = true; // Fallthrough case ISD::SETUGT: SSECC = 6; break; case ISD::SETO: SSECC = 7; break; case ISD::SETUEQ: case ISD::SETONE: SSECC = 8; break; } if (Swap) std::swap(Op0, Op1); return SSECC; } // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 // ones, and then concatenate the result back. static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); SDValue CC = Op.getOperand(2); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); // Issue the operation on the smaller types and concatenate the result back MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 && Op.getValueType().getScalarType() == MVT::i1 && "Cannot set masked compare for this operation"); ISD::CondCode SetCCOpcode = cast(CC)->get(); unsigned Opc = 0; bool Unsigned = false; bool Swap = false; unsigned SSECC; switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETNE: SSECC = 4; break; case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break; case ISD::SETUGT: SSECC = 6; Unsigned = true; break; case ISD::SETLT: Swap = true; //fall-through case ISD::SETGT: Opc = X86ISD::PCMPGTM; break; case ISD::SETULT: SSECC = 1; Unsigned = true; break; case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap case ISD::SETULE: Unsigned = true; //fall-through case ISD::SETLE: SSECC = 2; break; } if (Swap) std::swap(Op0, Op1); if (Opc) return DAG.getNode(Opc, dl, VT, Op0, Op1); Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM; return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); } /// \brief Try to turn a VSETULT into a VSETULE by modifying its second /// operand \p Op1. If non-trivial (for example because it's not constant) /// return an empty value. static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) { BuildVectorSDNode *BV = dyn_cast(Op1.getNode()); if (!BV) return SDValue(); MVT VT = Op1.getSimpleValueType(); MVT EVT = VT.getVectorElementType(); unsigned n = VT.getVectorNumElements(); SmallVector ULTOp1; for (unsigned i = 0; i < n; ++i) { ConstantSDNode *Elt = dyn_cast(BV->getOperand(i)); if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT) return SDValue(); // Avoid underflow. APInt Val = Elt->getAPIntValue(); if (Val == 0) return SDValue(); ULTOp1.push_back(DAG.getConstant(Val - 1, EVT)); } return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1); } static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); ISD::CondCode SetCCOpcode = cast(CC)->get(); bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); SDLoc dl(Op); if (isFP) { #ifndef NDEBUG MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); unsigned Opc = X86ISD::CMPP; if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16); Opc = X86ISD::CMPM; } // In the two special cases we can't handle, emit two comparisons. if (SSECC == 8) { unsigned CC0, CC1; unsigned CombineOpc; if (SetCCOpcode == ISD::SETUEQ) { CC0 = 3; CC1 = 0; CombineOpc = ISD::OR; } else { assert(SetCCOpcode == ISD::SETONE); CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; } SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC0, MVT::i8)); SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC1, MVT::i8)); return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } // Handle all other FP comparisons here. return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); } // Break 256-bit integer vector compare into smaller ones. if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntVSETCC(Op, DAG); bool MaskResult = (VT.getVectorElementType() == MVT::i1); EVT OpVT = Op1.getValueType(); if (Subtarget->hasAVX512()) { if (Op1.getValueType().is512BitVector() || (Subtarget->hasBWI() && Subtarget->hasVLX()) || (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); // In AVX-512 architecture setcc returns mask with i1 elements, // But there is no compare instruction for i8 and i16 elements in KNL. // We are not talking about 512-bit operands in this case, these // types are illegal. if (MaskResult && (OpVT.getVectorElementType().getSizeInBits() < 32 && OpVT.getVectorElementType().getSizeInBits() >= 8)) return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); } // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. unsigned Opc; bool Swap = false, Invert = false, FlipSigns = false, MinMax = false; bool Subus = false; switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETNE: Invert = true; case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; case ISD::SETLT: Swap = true; case ISD::SETGT: Opc = X86ISD::PCMPGT; break; case ISD::SETGE: Swap = true; case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; case ISD::SETULT: Swap = true; case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; case ISD::SETUGE: Swap = true; case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; } // Special case: Use min/max operations for SETULE/SETUGE MVT VET = VT.getVectorElementType(); bool hasMinMax = (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) || (Subtarget->hasSSE2() && (VET == MVT::i8)); if (hasMinMax) { switch (SetCCOpcode) { default: break; case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break; case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break; } if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } } bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); if (!MinMax && hasSubus) { // As another special case, use PSUBUS[BW] when it's profitable. E.g. for // Op0 u<= Op1: // t = psubus Op0, Op1 // pcmpeq t, <0..0> switch (SetCCOpcode) { default: break; case ISD::SETULT: { // If the comparison is against a constant we can turn this into a // setule. With psubus, setule does not require a swap. This is // beneficial because the constant in the register is no longer // destructed as the destination so it can be hoisted out of a loop. // Only do this pre-AVX since vpcmp* is no longer destructive. if (Subtarget->hasAVX()) break; SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG); if (ULEOp1.getNode()) { Op1 = ULEOp1; Subus = true; Invert = false; Swap = false; } break; } // Psubus is better than flip-sign because it requires no inversion. case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break; case ISD::SETULE: Subus = true; Invert = false; Swap = false; break; } if (Subus) { Opc = X86ISD::SUBUS; FlipSigns = false; } } if (Swap) std::swap(Op0, Op1); // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). if (VT == MVT::v2i64) { if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) { assert(Subtarget->hasSSE2() && "Don't know how to lower!"); // First cast everything to the right type. Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. The lower // compare is always unsigned. SDValue SB; if (FlipSigns) { SB = DAG.getConstant(0x80000000U, MVT::v4i32); } else { SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32); SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32); SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Sign, Zero, Sign, Zero); } Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); // Create masks for only the low parts/high parts of the 64 bit integers. static const int MaskHi[] = { 1, 1, 3, 3 }; static const int MaskLo[] = { 0, 0, 2, 2 }; SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); return DAG.getNode(ISD::BITCAST, dl, VT, Result); } if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) { // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with // pcmpeqd + pshufd + pand. assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); // First cast everything to the right type. Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); // Do the compare. SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); // Make sure the lower and upper halves are both all-ones. static const int Mask[] = { 1, 0, 3, 2 }; SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); return DAG.getNode(ISD::BITCAST, dl, VT, Result); } } // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. if (FlipSigns) { EVT EltVT = VT.getVectorElementType(); SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT); Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); // If the logical-not of the result is required, perform that now. if (Invert) Result = DAG.getNOT(dl, Result, VT); if (MinMax) Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); if (Subus) Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, getZeroVector(VT, Subtarget, DAG, dl)); return Result; } SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) && "SetCC type must be 8-bit or 1-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); ISD::CondCode CC = cast(Op.getOperand(2))->get(); // Optimize to BT if possible. // Lower (X & (1 << N)) == 0 to BT(X, N). // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && Op1.getOpcode() == ISD::Constant && cast(Op1)->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); if (NewSetCC.getNode()) { if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); return NewSetCC; } } // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. if (Op1.getOpcode() == ISD::Constant && (cast(Op1)->getZExtValue() == 1 || cast(Op1)->isNullValue()) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // If the input is a setcc, then reuse the input setcc or use a new one with // the inverted condition. if (Op0.getOpcode() == X86ISD::SETCC) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); bool Invert = (CC == ISD::SETNE) ^ cast(Op1)->isNullValue(); if (!Invert) return Op0; CCode = X86::GetOppositeBranchCondition(CCode); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); return SetCC; } } if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) && (cast(Op1)->getZExtValue() == 1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC); } bool isFP = Op1.getSimpleValueType().isFloatingPoint(); unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); if (X86CC == X86::COND_INVALID) return SDValue(); SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, MVT::i8), EFLAGS); if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); return SetCC; } // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getNode()->getOpcode(); if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || Opc == X86ISD::SAHF) return true; if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC || Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL || Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND)) return true; if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) return true; return false; } static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) return false; SDValue VOp0 = V.getOperand(0); unsigned InBits = VOp0.getValueSizeInBits(); unsigned Bits = V.getValueSizeInBits(); return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); } SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { bool addTest = true; SDValue Cond = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); SDLoc DL(Op); EVT VT = Op1.getValueType(); SDValue CC; // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available. Otherwise fp cmovs get lowered into a less efficient branch // sequence later on. if (Cond.getOpcode() == ISD::SETCC && ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || (Subtarget->hasSSE1() && VT == MVT::f32)) && VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); int SSECC = translateX86FSETCC( cast(Cond.getOperand(2))->get(), CondOp0, CondOp1); if (SSECC != 8) { if (Subtarget->hasAVX512()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1, DAG.getConstant(SSECC, MVT::i8)); return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2); } SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, DAG.getConstant(SSECC, MVT::i8)); SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); } } if (Cond.getOpcode() == ISD::SETCC) { SDValue NewCond = LowerSETCC(Cond, DAG); if (NewCond.getNode()) Cond = NewCond; } // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && isZero(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); unsigned CondCode =cast(Cond.getOperand(0))->getZExtValue(); if ((isAllOnes(Op1) || isAllOnes(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnes(Op2) ? Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb if (ConstantSDNode *YC = dyn_cast(Y)) if (YC->isNullValue() && (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, DAG.getConstant(0, CmpOp0.getValueType()), CmpOp0); SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, MVT::i8), SDValue(Neg.getNode(), 1)); return Res; } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); Cmp = ConvertCmpIfNecessary(Cmp, DAG); SDValue Res = // Res = 0 or -1. DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, MVT::i8), Cmp); if (isAllOnes(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); ConstantSDNode *N2C = dyn_cast(Op2); if (!N2C || !N2C->isNullValue()) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } } // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { ConstantSDNode *C = dyn_cast(Cond.getOperand(1)); if (C && C->getAPIntValue() == 1) Cond = Cond.getOperand(0); } // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); MVT VT = Op.getSimpleValueType(); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT)) // FPStack? IllegalFPCMov = !hasFPCMov(cast(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || Opc == X86ISD::BT) { // FIXME Cond = Cmp; addTest = false; } } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && Cond.getOperand(0).getValueType() != MVT::i8)) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); unsigned X86Opcode; unsigned X86Cond; SDVTList VTs; switch (CondOpcode) { case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; default: llvm_unreachable("unexpected overflowing operator"); } if (CondOpcode == ISD::UMULO) VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), MVT::i32); else VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); if (CondOpcode == ISD::UMULO) Cond = X86Op.getValue(2); else Cond = X86Op.getValue(1); CC = DAG.getConstant(X86Cond, MVT::i8); addTest = false; } if (addTest) { // Look pass the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); if (NewSetCC.getNode()) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; } } } if (addTest) { CC = DAG.getConstant(X86::COND_NE, MVT::i8); Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); } // a < b ? -1 : 0 -> RES = ~setcc_carry // a < b ? 0 : -1 -> RES = setcc_carry // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 0 : -1 -> RES = ~setcc_carry if (Cond.getOpcode() == X86ISD::SUB) { Cond = ConvertCmpIfNecessary(Cond, DAG); unsigned CondCode = cast(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, MVT::i8), Cond); if (isAllOnes(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; } } // X86 doesn't have an i8 cmov. If both operands are the result of a truncate // widen the cmov and push the truncate through. This avoids introducing a new // branch during isel and doesn't add any extensions. if (Op.getValueType() == MVT::i8 && Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); if (T1.getValueType() == T2.getValueType() && // Blacklist CopyFromReg to avoid partial register stalls. T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue); SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } } // X86ISD::CMOV means set the result (which is operand 1) to the RHS if // condition is true. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); SDValue Ops[] = { Op2, Op1, CC, Cond }; return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); } static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); MVT VTElt = VT.getVectorElementType(); MVT InVTElt = InVT.getVectorElementType(); SDLoc dl(Op); // SKX processor if ((InVTElt == MVT::i1) && (((Subtarget->hasBWI() && Subtarget->hasVLX() && VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || ((Subtarget->hasBWI() && VT.is512BitVector() && VTElt.getSizeInBits() <= 16)) || ((Subtarget->hasDQI() && Subtarget->hasVLX() && VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || ((Subtarget->hasDQI() && VT.is512BitVector() && VTElt.getSizeInBits() >= 32)))) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); unsigned int NumElts = VT.getVectorNumElements(); if (NumElts != 8 && NumElts != 16) return SDValue(); if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) { if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); return DAG.getNode(X86ISD::VSEXT, dl, VT, In); } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32; Constant *C = ConstantInt::get(*DAG.getContext(), APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits())); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); unsigned Alignment = cast(CP)->getAlignment(); SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(), false, false, false, Alignment); SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld); if (VT.is512BitVector()) return Brcst; return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst); } static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && (VT != MVT::v8i32 || InVT != MVT::v8i16) && (VT != MVT::v16i16 || InVT != MVT::v16i8)) return SDValue(); if (Subtarget->hasInt256()) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); // Optimize vectors in AVX mode // Sign extend v8i16 to v8i32 and // v4i32 to v4i64 // // Divide input vector into two parts // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 // concat the vectors to original VT unsigned NumElems = InVT.getVectorNumElements(); SDValue Undef = DAG.getUNDEF(InVT); SmallVector ShufMask1(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask1[i] = i; SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]); SmallVector ShufMask2(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask2[i] = i + NumElems/2; SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), VT.getVectorNumElements()/2); OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } // Lower vector extended loads using a shuffle. If SSSE3 is not available we // may emit an illegal shuffle but the expansion is still better than scalar // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise // we'll emit a shuffle and a arithmetic shift. // TODO: It is possible to support ZExt by zeroing the undef values during // the shuffle phase or after the shuffle. static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT RegVT = Op.getSimpleValueType(); assert(RegVT.isVector() && "We only custom lower vector sext loads."); assert(RegVT.isInteger() && "We only custom lower integer vector sext loads."); // Nothing useful we can do without SSE2 shuffles. assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2."); LoadSDNode *Ld = cast(Op.getNode()); SDLoc dl(Ld); EVT MemVT = Ld->getMemoryVT(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned RegSz = RegVT.getSizeInBits(); ISD::LoadExtType Ext = Ld->getExtensionType(); assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) && "Only anyext and sext are currently implemented."); assert(MemVT != RegVT && "Cannot extend to the same type"); assert(MemVT.isVector() && "Must load a vector from memory"); unsigned NumElems = RegVT.getVectorNumElements(); unsigned MemSz = MemVT.getSizeInBits(); assert(RegSz > MemSz && "Register size must be greater than the mem size"); if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) { // The only way in which we have a legal 256-bit vector result but not the // integer 256-bit operations needed to directly lower a sextload is if we // have AVX1 but not AVX2. In that case, we can always emit a sextload to // a 128-bit vector and a normal sign_extend to 256-bits that should get // correctly legalized. We do this late to allow the canonical form of // sextload to persist throughout the rest of the DAG combiner -- it wants // to fold together any extensions it can, and so will fuse a sign_extend // of an sextload into a sextload targeting a wider value. SDValue Load; if (MemSz == 128) { // Just switch this to a normal load. assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " "it must be a legal 128-bit vector " "type!"); Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Ld->getAlignment()); } else { assert(MemSz < 128 && "Can't extend a type wider than 128 bits to a 256 bit vector!"); // Do an sext load to a 128-bit vector type. We want to use the same // number of elements, but elements half as wide. This will end up being // recursively lowered by this routine, but will succeed as we definitely // have all the necessary features if we're using AVX1. EVT HalfEltVT = EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); Load = DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), MemVT, Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Ld->getAlignment()); } // Replace chain users with the new chain. assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); // Finally, do a normal sign-extend to the desired register. return DAG.getSExtOrTrunc(Load, dl, RegVT); } // All sizes must be a power of two. assert(isPowerOf2_32(RegSz * MemSz * NumElems) && "Non-power-of-two elements are not custom lowered!"); // Attempt to load the original value using scalar loads. // Find the largest scalar type that divides the total loaded size. MVT SclrLoadTy = MVT::i8; for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { SclrLoadTy = Tp; } } // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && (64 <= MemSz)) SclrLoadTy = MVT::f64; // Calculate the number of scalar loads that we need to perform // in order to load our vector from memory. unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && "Can only lower sext loads with a single scalar load!"); unsigned loadRegZize = RegSz; if (Ext == ISD::SEXTLOAD && RegSz == 256) loadRegZize /= 2; // Represent our vector as a sequence of elements which are the // largest scalar that we can load. EVT LoadUnitVecVT = EVT::getVectorVT( *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits()); // Represent the data using the same element type that is stored in // memory. In practice, we ''widen'' MemVT. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), loadRegZize / MemVT.getScalarType().getSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && "Invalid vector type"); // We can't shuffle using an illegal type. assert(TLI.isTypeLegal(WideVecVT) && "We only lower types that form legal widened vector types"); SmallVector Chains; SDValue Ptr = Ld->getBasePtr(); SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy()); SDValue Res = DAG.getUNDEF(LoadUnitVecVT); for (unsigned i = 0; i < NumLoads; ++i) { // Perform a single load. SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Ld->getAlignment()); Chains.push_back(ScalarLoad.getValue(1)); // Create the first element type using SCALAR_TO_VECTOR in order to avoid // another round of DAGCombining. if (i == 0) Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); else Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, ScalarLoad, DAG.getIntPtrConstant(i)); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); unsigned SizeRatio = RegSz / MemSz; if (Ext == ISD::SEXTLOAD) { // If we have SSE4.1, we can directly emit a VSEXT node. if (Subtarget->hasSSE41()) { SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Sext; } // Otherwise we'll shuffle the small elements in the high bits of the // larger type and perform an arithmetic shift. If the shift is not legal // it's better to scalarize. assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) && "We can't implement a sext load without an arithmetic right shift!"); // Redistribute the loaded elements into the different locations. SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i * SizeRatio + SizeRatio - 1] = i; SDValue Shuff = DAG.getVectorShuffle( WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); // Build the arithmetic shift. unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - MemVT.getVectorElementType().getSizeInBits(); Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT)); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Shuff; } // Redistribute the loaded elements into the different locations. SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i * SizeRatio] = i; SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); // Bitcast to the requested type. Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Shuff; } // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart // from the AND / OR. static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { Opc = Op.getOpcode(); if (Opc != ISD::OR && Opc != ISD::AND) return false; return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse() && Op.getOperand(1).getOpcode() == X86ISD::SETCC && Op.getOperand(1).hasOneUse()); } // isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and // 1 and that the SETCC node has a single use. static bool isXor1OfSetCC(SDValue Op) { if (Op.getOpcode() != ISD::XOR) return false; ConstantSDNode *N1C = dyn_cast(Op.getOperand(1)); if (N1C && N1C->getAPIntValue() == 1) { return Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse(); } return false; } SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { bool addTest = true; SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); SDLoc dl(Op); SDValue CC; bool Inverted = false; if (Cond.getOpcode() == ISD::SETCC) { // Check for setcc([su]{add,sub,mul}o == 0). if (cast(Cond.getOperand(2))->get() == ISD::SETEQ && isa(Cond.getOperand(1)) && cast(Cond.getOperand(1))->isNullValue() && Cond.getOperand(0).getResNo() == 1 && (Cond.getOperand(0).getOpcode() == ISD::SADDO || Cond.getOperand(0).getOpcode() == ISD::UADDO || Cond.getOperand(0).getOpcode() == ISD::SSUBO || Cond.getOperand(0).getOpcode() == ISD::USUBO || Cond.getOperand(0).getOpcode() == ISD::SMULO || Cond.getOperand(0).getOpcode() == ISD::UMULO)) { Inverted = true; Cond = Cond.getOperand(0); } else { SDValue NewCond = LowerSETCC(Cond, DAG); if (NewCond.getNode()) Cond = NewCond; } } #if 0 // FIXME: LowerXALUO doesn't handle these!! else if (Cond.getOpcode() == X86ISD::ADD || Cond.getOpcode() == X86ISD::SUB || Cond.getOpcode() == X86ISD::SMUL || Cond.getOpcode() == X86ISD::UMUL) Cond = LowerXALUO(Cond, DAG); #endif // Look pass (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { ConstantSDNode *C = dyn_cast(Cond.getOperand(1)); if (C && C->getAPIntValue() == 1) Cond = Cond.getOperand(0); } // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { Cond = Cmp; addTest = false; } else { switch (cast(CC)->getZExtValue()) { default: break; case X86::COND_O: case X86::COND_B: // These can only come from an arithmetic instruction with overflow, // e.g. SADDO, UADDO. Cond = Cond.getNode()->getOperand(1); addTest = false; break; } } } CondOpcode = Cond.getOpcode(); if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && Cond.getOperand(0).getValueType() != MVT::i8)) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); unsigned X86Opcode; unsigned X86Cond; SDVTList VTs; // Keep this in sync with LowerXALUO, otherwise we might create redundant // instructions that can't be removed afterwards (i.e. X86ISD::ADD and // X86ISD::INC). switch (CondOpcode) { case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; case ISD::SADDO: if (ConstantSDNode *C = dyn_cast(RHS)) if (C->isOne()) { X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; break; } X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; case ISD::SSUBO: if (ConstantSDNode *C = dyn_cast(RHS)) if (C->isOne()) { X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; break; } X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; default: llvm_unreachable("unexpected overflowing operator"); } if (Inverted) X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); if (CondOpcode == ISD::UMULO) VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), MVT::i32); else VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); if (CondOpcode == ISD::UMULO) Cond = X86Op.getValue(2); else Cond = X86Op.getValue(1); CC = DAG.getConstant(X86Cond, MVT::i8); addTest = false; } else { unsigned CondOpc; if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { SDValue Cmp = Cond.getOperand(0).getOperand(1); if (CondOpc == ISD::OR) { // Also, recognize the pattern generated by an FCMP_UNE. We can emit // two branches instead of an explicit OR instruction with a // separate test. if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp)) { CC = Cond.getOperand(0).getOperand(0); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = Cond.getOperand(1).getOperand(0); Cond = Cmp; addTest = false; } } else { // ISD::AND // Also, recognize the pattern generated by an FCMP_OEQ. We can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp) && Op.getNode()->hasOneUse()) { X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, MVT::i8); SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); X86::CondCode CCode = (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, MVT::i8); Cond = Cmp; addTest = false; } } } } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. // It should be transformed during dag combiner except when the condition // is set by a arithmetics with overflow node. X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; } else if (Cond.getOpcode() == ISD::SETCC && cast(Cond.getOperand(2))->get() == ISD::SETOEQ) { // For FCMP_OEQ, we can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Op.getNode()->hasOneUse()) { SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = DAG.getConstant(X86::COND_P, MVT::i8); Cond = Cmp; addTest = false; } } } else if (Cond.getOpcode() == ISD::SETCC && cast(Cond.getOperand(2))->get() == ISD::SETUNE) { // For FCMP_UNE, we can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Op.getNode()->hasOneUse()) { SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_UNE. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = DAG.getConstant(X86::COND_NP, MVT::i8); Cond = Cmp; addTest = false; Dest = FalseBB; } } } } if (addTest) { // Look pass the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); if (NewSetCC.getNode()) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; } } } if (addTest) { X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; CC = DAG.getConstant(X86Cond, MVT::i8); Cond = EmitTest(Cond, X86Cond, dl, DAG); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cond); } // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. // Calls to _alloca are needed to probe the stack when allocating more than 4k // bytes in one go. Touching the stack at 4K increments is necessary to ensure // that the guard pages used by the OS virtual memory manager are allocated in // correct sequence. SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) || SplitStack; SDLoc dl(Op); if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDNode* Node = Op.getNode(); unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" " not tell us which reg is the stack pointer!"); EVT VT = Node->getValueType(0); SDValue Tmp1 = SDValue(Node, 0); SDValue Tmp2 = SDValue(Node, 1); SDValue Tmp3 = Node->getOperand(2); SDValue Chain = Tmp1.getOperand(0); // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true), SDLoc(Node)); SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast(Tmp3)->getZExtValue(); const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, DAG.getConstant(-(uint64_t)Align, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true), DAG.getIntPtrConstant(0, true), SDValue(), SDLoc(Node)); SDValue Ops[2] = { Tmp1, Tmp2 }; return DAG.getMergeValues(Ops, dl); } // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); unsigned Align = cast(Op.getOperand(2))->getZExtValue(); EVT VT = Op.getNode()->getValueType(0); bool Is64Bit = Subtarget->is64Bit(); EVT SPTy = getPointerTy(); if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { // The 64 bit implementation of segmented stacks needs to clobber both r10 // r11. This makes it impossible to use it along with nested parameters. const Function *F = MF.getFunction(); for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) if (I->hasNestAttr()) report_fatal_error("Cannot use segmented stacks with functions that " "have nested arguments."); } const TargetRegisterClass *AddrRegClass = getRegClassFor(getPointerTy()); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); SDValue Ops1[2] = { Value, Chain }; return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX); Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); Flag = Chain.getValue(1); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); const X86RegisterInfo *RegInfo = static_cast( DAG.getSubtarget().getRegisterInfo()); unsigned SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); if (Align) { SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(-(uint64_t)Align, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } SDValue Ops1[2] = { SP, Chain }; return DAG.getMergeValues(Ops1, dl); } } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const Value *SV = cast(Op.getOperand(2))->getValue(); SDLoc DL(Op); if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), getPointerTy()); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV), false, false, 0); } // __va_list_tag: // gp_offset (0 - 6 * 8) // fp_offset (48 - 48 + 8 * 16) // overflow_arg_area (point to parameters coming in memory). // reg_save_area SmallVector MemOps; SDValue FIN = Op.getOperand(1); // Store gp_offset SDValue Store = DAG.getStore(Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsGPOffset(), MVT::i32), FIN, MachinePointerInfo(SV), false, false, 0); MemOps.push_back(Store); // Store fp_offset FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, DAG.getIntPtrConstant(4)); Store = DAG.getStore(Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsFPOffset(), MVT::i32), FIN, MachinePointerInfo(SV, 4), false, false, 0); MemOps.push_back(Store); // Store ptr to overflow_arg_area FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, DAG.getIntPtrConstant(4)); SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), getPointerTy()); Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8), false, false, 0); MemOps.push_back(Store); // Store ptr to reg_save_area. FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, DAG.getIntPtrConstant(8)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy()); Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(SV, 16), false, false, 0); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); assert((Subtarget->isTargetLinux() || Subtarget->isTargetDarwin()) && "Unhandled target in LowerVAARG"); assert(Op.getNode()->getNumOperands() == 4); SDValue Chain = Op.getOperand(0); SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); unsigned Align = Op.getConstantOperandVal(3); SDLoc dl(Op); EVT ArgVT = Op.getNode()->getValueType(0); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); uint8_t ArgMode; // Decide which area this value should be read from. // TODO: Implement the AMD64 ABI in its entirety. This simple // selection mechanism works only for the basic types. if (ArgVT == MVT::f80) { llvm_unreachable("va_arg for f80 not yet implemented"); } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { ArgMode = 2; // Argument passed in XMM register. Use fp_offset. } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. } else { llvm_unreachable("Unhandled argument type in LowerVAARG"); } if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. assert(!DAG.getTarget().Options.UseSoftFloat && !(DAG.getMachineFunction() .getFunction()->getAttributes() .hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) && Subtarget->hasSSE1()); } // Insert VAARG_64 node into the DAG // VAARG_64 returns two values: Variable Argument Address, Chain SmallVector InstOps; InstOps.push_back(Chain); InstOps.push_back(SrcPtr); InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); InstOps.push_back(DAG.getConstant(Align, MVT::i32)); SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV), /*Align=*/0, /*Volatile=*/false, /*ReadMem=*/true, /*WriteMem=*/true); Chain = VAARG.getValue(1); // Load the next argument and return it return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo(), false, false, false, 0); } static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { // X86-64 va_list is a struct { i32, i32, i8*, i8* }. assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); SDValue Chain = Op.getOperand(0); SDValue DstPtr = Op.getOperand(1); SDValue SrcPtr = Op.getOperand(2); const Value *DstSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); SDLoc DL(Op); return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } // getTargetVShiftByConstNode - Handle vector element shifts where the shift // amount is a constant. Takes immediate version of shift as input. static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG) { MVT ElementType = VT.getVectorElementType(); // Fold this packed shift into its first operand if ShiftAmt is 0. if (ShiftAmt == 0) return SrcOp; // Check for ShiftAmt >= element width if (ShiftAmt >= ElementType.getSizeInBits()) { if (Opc == X86ISD::VSRAI) ShiftAmt = ElementType.getSizeInBits() - 1; else return DAG.getConstant(0, VT); } assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) && "Unknown target vector shift-by-constant node"); // Fold this packed vector shift into a build vector if SrcOp is a // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT. if (VT == SrcOp.getSimpleValueType() && ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { SmallVector Elts; unsigned NumElts = SrcOp->getNumOperands(); ConstantSDNode *ND; switch(Opc) { default: llvm_unreachable(nullptr); case X86ISD::VSHLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->getOpcode() == ISD::UNDEF) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType)); } break; case X86ISD::VSRLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->getOpcode() == ISD::UNDEF) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType)); } break; case X86ISD::VSRAI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->getOpcode() == ISD::UNDEF) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType)); } break; } return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); } return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8)); } // getTargetVShiftNode - Handle vector element shifts where the shift amount // may or may not be a constant. Takes immediate version of shift as input. static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, SDValue SrcOp, SDValue ShAmt, SelectionDAG &DAG) { MVT SVT = ShAmt.getSimpleValueType(); assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); // Catch shift-by-constant. if (ConstantSDNode *CShAmt = dyn_cast(ShAmt)) return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, CShAmt->getZExtValue(), DAG); // Change opcode to non-immediate version switch (Opc) { default: llvm_unreachable("Unknown target vector shift node"); case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; } const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget(); if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { // Let the shuffle legalizer expand this shift amount node. SDValue Op0 = ShAmt.getOperand(0); Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG); } else { // Need to build a vector containing shift amount. // SSE/AVX packed shifts only use the lower 64-bit of the shift count. SmallVector ShOps; ShOps.push_back(ShAmt); if (SVT == MVT::i32) { ShOps.push_back(DAG.getConstant(0, SVT)); ShOps.push_back(DAG.getUNDEF(SVT)); } ShOps.push_back(DAG.getUNDEF(SVT)); MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64; ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps); } // The return type has to be a 128-bit type with the same element // type as the input type. MVT EltVT = VT.getVectorElementType(); EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt); return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } /// \brief Return (and \p Op, \p Mask) for compare instructions or /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the /// necessary casting for \p Mask when lowering masking intrinsics. static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget *Subtarget, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorNumElements()); EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, Mask.getValueType().getSizeInBits()); SDLoc dl(Op); assert(MaskVT.isSimple() && "invalid mask type"); if (isAllOnes(Mask)) return Op; // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), DAG.getIntPtrConstant(0)); switch (Op.getOpcode()) { default: break; case X86ISD::PCMPEQM: case X86ISD::PCMPGTM: case X86ISD::CMPM: case X86ISD::CMPMU: return DAG.getNode(ISD::AND, dl, VT, Op, VMask); } if (PreservedSrc.getOpcode() == ISD::UNDEF) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc); } /// \brief Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). /// The mask is comming as MVT::i8 and it should be truncated /// to MVT::i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using /// "X86select" instead of "vselect". We just can't create the "vselect" node for /// a scalar instruction. static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget *Subtarget, SelectionDAG &DAG) { if (isAllOnes(Mask)) return Op; EVT VT = Op.getValueType(); SDLoc dl(Op); // The mask should be of type MVT::i1 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); if (PreservedSrc.getOpcode() == ISD::UNDEF) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); } static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) { switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_fma_vfmadd_ps: case Intrinsic::x86_fma_vfmadd_pd: case Intrinsic::x86_fma_vfmadd_ps_256: case Intrinsic::x86_fma_vfmadd_pd_256: case Intrinsic::x86_fma_mask_vfmadd_ps_512: case Intrinsic::x86_fma_mask_vfmadd_pd_512: return X86ISD::FMADD; case Intrinsic::x86_fma_vfmsub_ps: case Intrinsic::x86_fma_vfmsub_pd: case Intrinsic::x86_fma_vfmsub_ps_256: case Intrinsic::x86_fma_vfmsub_pd_256: case Intrinsic::x86_fma_mask_vfmsub_ps_512: case Intrinsic::x86_fma_mask_vfmsub_pd_512: return X86ISD::FMSUB; case Intrinsic::x86_fma_vfnmadd_ps: case Intrinsic::x86_fma_vfnmadd_pd: case Intrinsic::x86_fma_vfnmadd_ps_256: case Intrinsic::x86_fma_vfnmadd_pd_256: case Intrinsic::x86_fma_mask_vfnmadd_ps_512: case Intrinsic::x86_fma_mask_vfnmadd_pd_512: return X86ISD::FNMADD; case Intrinsic::x86_fma_vfnmsub_ps: case Intrinsic::x86_fma_vfnmsub_pd: case Intrinsic::x86_fma_vfnmsub_ps_256: case Intrinsic::x86_fma_vfnmsub_pd_256: case Intrinsic::x86_fma_mask_vfnmsub_ps_512: case Intrinsic::x86_fma_mask_vfnmsub_pd_512: return X86ISD::FNMSUB; case Intrinsic::x86_fma_vfmaddsub_ps: case Intrinsic::x86_fma_vfmaddsub_pd: case Intrinsic::x86_fma_vfmaddsub_ps_256: case Intrinsic::x86_fma_vfmaddsub_pd_256: case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: return X86ISD::FMADDSUB; case Intrinsic::x86_fma_vfmsubadd_ps: case Intrinsic::x86_fma_vfmsubadd_pd: case Intrinsic::x86_fma_vfmsubadd_ps_256: case Intrinsic::x86_fma_vfmsubadd_pd_256: case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: return X86ISD::FMSUBADD; } } static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); EVT VT = Op.getValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); if (IntrData) { switch(IntrData->Type) { case INTR_TYPE_1OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); case INTR_TYPE_2OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case INTR_TYPE_3OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case INTR_TYPE_1OP_MASK_RM: { SDValue Src = Op.getOperand(1); SDValue Src0 = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue RoundingMode = Op.getOperand(4); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, RoundingMode), Mask, Src0, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src0 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); SDValue RoundingMode = Op.getOperand(5); return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, RoundingMode), Mask, Src0, Subtarget, DAG); } case INTR_TYPE_2OP_MASK: { return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1), Op.getOperand(2)), Op.getOperand(4), Op.getOperand(3), Subtarget, DAG); } case CMP_MASK: case CMP_MASK_CC: { // Comparison intrinsics with masks. // Example of transformation: // (i8 (int_x86_avx512_mask_pcmpeq_q_128 // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> // (i8 (bitcast // (v8i1 (insert_subvector undef, // (v2i1 (and (PCMPEQM %a, %b), // (extract_subvector // (v8i1 (bitcast %mask)), 0))), 0)))) EVT VT = Op.getOperand(1).getValueType(); EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorNumElements()); SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3); EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, Mask.getValueType().getSizeInBits()); SDValue Cmp; if (IntrData->Type == CMP_MASK_CC) { Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } else { assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!"); Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2)); } SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, DAG.getTargetConstant(0, MaskVT), Subtarget, DAG); SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, DAG.getUNDEF(BitcastVT), CmpMask, DAG.getIntPtrConstant(0)); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG); case VSHIFT_MASK: return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG), Op.getOperand(4), Op.getOperand(3), Subtarget, DAG); case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); if (isAllOnes(Mask)) // return data as is return Op.getOperand(1); EVT VT = Op.getValueType(); EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorNumElements()); EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, Mask.getValueType().getSizeInBits()); SDLoc dl(Op); SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), DAG.getIntPtrConstant(0)); return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress, PassThru); } case BLEND: { SDValue Mask = Op.getOperand(3); EVT VT = Op.getValueType(); EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorNumElements()); EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, Mask.getValueType().getSizeInBits()); SDLoc dl(Op); SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), DAG.getIntPtrConstant(0)); return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), Op.getOperand(2)); } case FMA_OP_MASK: { return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)), Op.getOperand(4), Op.getOperand(1), Subtarget, DAG); } default: break; } } switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::x86_avx512_mask_valign_q_512: case Intrinsic::x86_avx512_mask_valign_d_512: // Vector source operands are swapped. return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(1), Op.getOperand(3)), Op.getOperand(5), Op.getOperand(4), Subtarget, DAG); // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestz_256: case Intrinsic::x86_avx_ptestc_256: case Intrinsic::x86_avx_ptestnzc_256: case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestc_ps: case Intrinsic::x86_avx_vtestnzc_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: case Intrinsic::x86_avx_vtestc_pd_256: case Intrinsic::x86_avx_vtestnzc_pd_256: { bool IsTestPacked = false; unsigned X86CC; switch (IntNo) { default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: IsTestPacked = true; // Fallthrough case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_avx_ptestz_256: // ZF = 1 X86CC = X86::COND_E; break; case Intrinsic::x86_avx_vtestc_ps: case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestc_pd_256: IsTestPacked = true; // Fallthrough case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_avx_ptestc_256: // CF = 1 X86CC = X86::COND_B; break; case Intrinsic::x86_avx_vtestnzc_ps: case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestnzc_pd_256: IsTestPacked = true; // Fallthrough case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestnzc_256: // ZF and CF = 0 X86CC = X86::COND_A; break; } SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); SDValue CC = DAG.getConstant(X86CC, MVT::i8); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_avx512_kortestz_w: case Intrinsic::x86_avx512_kortestc_w: { unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B; SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1)); SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2)); SDValue CC = DAG.getConstant(X86CC, MVT::i8); SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: case Intrinsic::x86_sse42_pcmpestric128: case Intrinsic::x86_sse42_pcmpistrio128: case Intrinsic::x86_sse42_pcmpestrio128: case Intrinsic::x86_sse42_pcmpistris128: case Intrinsic::x86_sse42_pcmpestris128: case Intrinsic::x86_sse42_pcmpistriz128: case Intrinsic::x86_sse42_pcmpestriz128: { unsigned Opcode; unsigned X86CC; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse42_pcmpistria128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpestria128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpistric128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpestric128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpistrio128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpestrio128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpistris128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpestris128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpistriz128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_E; break; case Intrinsic::x86_sse42_pcmpestriz128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_E; break; } SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, MVT::i8), SDValue(PCMP.getNode(), 1)); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_sse42_pcmpistri128: case Intrinsic::x86_sse42_pcmpestri128: { unsigned Opcode; if (IntNo == Intrinsic::x86_sse42_pcmpistri128) Opcode = X86ISD::PCMPISTRI; else Opcode = X86ISD::PCMPESTRI; SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } case Intrinsic::x86_fma_mask_vfmadd_ps_512: case Intrinsic::x86_fma_mask_vfmadd_pd_512: case Intrinsic::x86_fma_mask_vfmsub_ps_512: case Intrinsic::x86_fma_mask_vfmsub_pd_512: case Intrinsic::x86_fma_mask_vfnmadd_ps_512: case Intrinsic::x86_fma_mask_vfnmadd_pd_512: case Intrinsic::x86_fma_mask_vfnmsub_ps_512: case Intrinsic::x86_fma_mask_vfnmsub_pd_512: case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: { auto *SAE = cast(Op.getOperand(5)); if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION) return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)), Op.getOperand(4), Op.getOperand(1), Subtarget, DAG); else return SDValue(); } case Intrinsic::x86_fma_vfmadd_ps: case Intrinsic::x86_fma_vfmadd_pd: case Intrinsic::x86_fma_vfmsub_ps: case Intrinsic::x86_fma_vfmsub_pd: case Intrinsic::x86_fma_vfnmadd_ps: case Intrinsic::x86_fma_vfnmadd_pd: case Intrinsic::x86_fma_vfnmsub_ps: case Intrinsic::x86_fma_vfnmsub_pd: case Intrinsic::x86_fma_vfmaddsub_ps: case Intrinsic::x86_fma_vfmaddsub_pd: case Intrinsic::x86_fma_vfmsubadd_ps: case Intrinsic::x86_fma_vfmsubadd_pd: case Intrinsic::x86_fma_vfmadd_ps_256: case Intrinsic::x86_fma_vfmadd_pd_256: case Intrinsic::x86_fma_vfmsub_ps_256: case Intrinsic::x86_fma_vfmsub_pd_256: case Intrinsic::x86_fma_vfnmadd_ps_256: case Intrinsic::x86_fma_vfnmadd_pd_256: case Intrinsic::x86_fma_vfnmsub_ps_256: case Intrinsic::x86_fma_vfnmsub_pd_256: case Intrinsic::x86_fma_vfmaddsub_ps_256: case Intrinsic::x86_fma_vfmaddsub_pd_256: case Intrinsic::x86_fma_vfmsubadd_ps_256: case Intrinsic::x86_fma_vfmsubadd_pd_256: return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } } static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget * Subtarget) { SDLoc dl(Op); ConstantSDNode *C = dyn_cast(ScaleOp); assert(C && "Invalid scale type"); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); EVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); else MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); if (Src.getOpcode() == ISD::UNDEF) Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; return DAG.getMergeValues(RetOps, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); ConstantSDNode *C = dyn_cast(ScaleOp); assert(C && "Invalid scale type"); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); SDValue Disp = DAG.getTargetConstant(0, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); EVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); else MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); return SDValue(Res, 1); } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); ConstantSDNode *C = dyn_cast(ScaleOp); assert(C && "Invalid scale type"); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); SDValue Disp = DAG.getTargetConstant(0, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); EVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); else MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); //SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); return SDValue(Res, 0); } // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that // read performance monitor counters (x86_rdpmc). static void getReadPerformanceCounter(SDNode *N, SDLoc DL, SelectionDAG &DAG, const X86Subtarget *Subtarget, SmallVectorImpl &Results) { assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue LO, HI; // The ECX register is used to select the index of the performance counter // to read. SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2)); SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); // Reads the content of a 64-bit performance counter and returns it in the // registers EDX:EAX. if (Subtarget->is64Bit()) { LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); } else { LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, LO.getValue(2)); } Chain = HI.getValue(1); if (Subtarget->is64Bit()) { // The EAX register is loaded with the low-order 32 bits. The EDX register // is loaded with the supported high-order bits of the counter. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. SDValue Ops[] = { LO, HI }; SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); } // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is // also used to custom lower READCYCLECOUNTER nodes. static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget *Subtarget, SmallVectorImpl &Results) { SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); SDValue LO, HI; // The processor's time-stamp counter (a 64-bit MSR) is stored into the // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR // and the EAX register is loaded with the low-order 32 bits. if (Subtarget->is64Bit()) { LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); } else { LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, LO.getValue(2)); } SDValue Chain = HI.getValue(1); if (Opcode == X86ISD::RDTSCP_DAG) { assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into // the ECX register. Add 'ecx' explicitly to the chain. SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, HI.getValue(2)); // Explicitly store the content of ECX at the location passed in input // to the 'rdtscp' intrinsic. Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), MachinePointerInfo(), false, false, 0); } if (Subtarget->is64Bit()) { // The EDX register is loaded with the high-order 32 bits of the MSR, and // the EAX register is loaded with the low-order 32 bits. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. SDValue Ops[] = { LO, HI }; SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); } static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SmallVector Results; SDLoc DL(Op); getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); return DAG.getMergeValues(Results, DL); } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) return SDValue(); SDLoc dl(Op); switch(IntrData->Type) { default: llvm_unreachable("Unknown Intrinsic Type"); break; case RDSEED: case RDRAND: { // Emit the node with the right value type. SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), DAG.getConstant(1, Op->getValueType(1)), DAG.getConstant(X86::COND_B, MVT::i32), SDValue(Result.getNode(), 1) }; SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, DAG.getVTList(Op->getValueType(1), MVT::Glue), Ops); // Return { result, isValid, chain }. return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } case GATHER: { //gather(v1, mask, index, base, scale); SDValue Chain = Op.getOperand(0); SDValue Src = Op.getOperand(2); SDValue Base = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case SCATTER: { //scatter(base, mask, index, v1, scale); SDValue Chain = Op.getOperand(0); SDValue Base = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain); } case PREFETCH: { SDValue Hint = Op.getOperand(6); unsigned HintVal; if (dyn_cast (Hint) == nullptr || (HintVal = dyn_cast (Hint)->getZExtValue()) > 1) llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1"); unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0); SDValue Chain = Op.getOperand(0); SDValue Mask = Op.getOperand(2); SDValue Index = Op.getOperand(3); SDValue Base = Op.getOperand(4); SDValue Scale = Op.getOperand(5); return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain); } // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). case RDTSC: { SmallVector Results; getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // Read Performance Monitoring Counters. case RDPMC: { SmallVector Results; getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // XTEST intrinsics. case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86::COND_NE, MVT::i8), InTrans); SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Ret, SDValue(InTrans.getNode(), 1)); } // ADC/ADCX/SBB case ADX: { SmallVector Results; SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), DAG.getConstant(-1, MVT::i8)); SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), Op.getOperand(4), GenCF.getValue(1)); SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), Op.getOperand(5), MachinePointerInfo(), false, false, 0); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86::COND_B, MVT::i8), Res.getValue(1)); Results.push_back(SetCC); Results.push_back(Store); return DAG.getMergeValues(Results, dl); } case COMPRESS_TO_MEM: { SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue DataToCompress = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); if (isAllOnes(Mask)) // return just a store return DAG.getStore(Chain, dl, DataToCompress, Addr, MachinePointerInfo(), false, false, 0); EVT VT = DataToCompress.getValueType(); EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorNumElements()); EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, Mask.getValueType().getSizeInBits()); SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), DAG.getIntPtrConstant(0)); SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress, DAG.getUNDEF(VT)); return DAG.getStore(Chain, dl, Compressed, Addr, MachinePointerInfo(), false, false, 0); } case EXPAND_FROM_MEM: { SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue PathThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); EVT VT = Op.getValueType(); if (isAllOnes(Mask)) // return just a load return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, false, 0); EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorNumElements()); EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, Mask.getValueType().getSizeInBits()); SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), DAG.getIntPtrConstant(0)); SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, false, 0); SmallVector Results; Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, PathThru)); Results.push_back(Chain); return DAG.getMergeValues(Results, dl); } } } SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setReturnAddressIsTaken(true); if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); EVT PtrVT = getPointerTy(); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); const X86RegisterInfo *RegInfo = static_cast( DAG.getSubtarget().getRegisterInfo()); SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); } // Just load the return address. SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, MachinePointerInfo(), false, false, false, 0); } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); const X86RegisterInfo *RegInfo = static_cast( DAG.getSubtarget().getRegisterInfo()); unsigned FrameReg = RegInfo->getPtrSizedFrameRegister( DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, MachinePointerInfo(), false, false, false, 0); return FrameAddr; } // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT) const { unsigned Reg = StringSwitch(RegName) .Case("esp", X86::ESP) .Case("rsp", X86::RSP) .Default(0); if (Reg) return Reg; report_fatal_error("Invalid register name global variable"); } SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { const X86RegisterInfo *RegInfo = static_cast( DAG.getSubtarget().getRegisterInfo()); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); } SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); SDLoc dl (Op); EVT PtrVT = getPointerTy(); const X86RegisterInfo *RegInfo = static_cast( DAG.getSubtarget().getRegisterInfo()); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && "Invalid Frame Register!"); SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, DAG.getIntPtrConstant(RegInfo->getSlotSize())); StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), false, false, 0); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, DAG.getRegister(StoreAddrReg, PtrVT)); } SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1)); } SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1)); } static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { return Op.getOperand(0); } SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { SDValue Root = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value SDLoc dl (Op); const Value *TrmpAddr = cast(Op.getOperand(4))->getValue(); const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); if (Subtarget->is64Bit()) { SDValue OutChains[6]; // Large code-model. const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix // Load the pointer to the nested function into R11. unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 SDValue Addr = Trmp; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), Addr, MachinePointerInfo(TrmpAddr), false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(2, MVT::i64)); OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2), false, false, 2); // Load the 'nest' parameter value into R10. // R10 is specified in X86CallingConv.td OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(10, MVT::i64)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), Addr, MachinePointerInfo(TrmpAddr, 10), false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(12, MVT::i64)); OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12), false, false, 2); // Jump to the nested function. OpCode = (JMP64r << 8) | REX_WB; // jmpq *... Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(20, MVT::i64)); OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), Addr, MachinePointerInfo(TrmpAddr, 20), false, false, 0); unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(22, MVT::i64)); OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, MachinePointerInfo(TrmpAddr, 22), false, false, 0); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } else { const Function *Func = cast(cast(Op.getOperand(5))->getValue()); CallingConv::ID CC = Func->getCallingConv(); unsigned NestReg; switch (CC) { default: llvm_unreachable("Unsupported calling convention"); case CallingConv::C: case CallingConv::X86_StdCall: { // Pass 'nest' parameter in ECX. // Must be kept in sync with X86CallingConv.td NestReg = X86::ECX; // Check that ECX wasn't needed by an 'inreg' parameter. FunctionType *FTy = Func->getFunctionType(); const AttributeSet &Attrs = Func->getAttributes(); if (!Attrs.isEmpty() && !Func->isVarArg()) { unsigned InRegCount = 0; unsigned Idx = 1; for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) if (Attrs.hasAttribute(Idx, Attribute::InReg)) // FIXME: should only count parameters that are lowered to integers. InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; if (InRegCount > 2) { report_fatal_error("Nest register in use - reduce number of inreg" " parameters!"); } } break; } case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::Fast: // Pass 'nest' parameter in EAX. // Must be kept in sync with X86CallingConv.td NestReg = X86::EAX; break; } SDValue OutChains[4]; SDValue Addr, Disp; Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(10, MVT::i32)); Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); // This is storing the opcode for MOV32ri. const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(MOV32ri|N86Reg, MVT::i8), Trmp, MachinePointerInfo(TrmpAddr), false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(1, MVT::i32)); OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1), false, false, 1); const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(5, MVT::i32)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, MachinePointerInfo(TrmpAddr, 5), false, false, 1); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(6, MVT::i32)); OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6), false, false, 1); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } } SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { /* The rounding mode is in bits 11:10 of FPSR, and has the following settings: 00 Round to nearest 01 Round to -inf 10 Round to +inf 11 Round to 0 FLT_ROUNDS, on the other hand, expects the following: -1 Undefined 0 Round to 0 1 Round to nearest 2 Round to +inf 3 Round to -inf To perform the conversion, we do: (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) */ MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); // Save FP Control Word to stack slot int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOStore, 2, 2); SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO); // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo(), false, false, false, 0); // Transform as necessary SDValue CWD1 = DAG.getNode(ISD::SRL, DL, MVT::i16, DAG.getNode(ISD::AND, DL, MVT::i16, CWD, DAG.getConstant(0x800, MVT::i16)), DAG.getConstant(11, MVT::i8)); SDValue CWD2 = DAG.getNode(ISD::SRL, DL, MVT::i16, DAG.getNode(ISD::AND, DL, MVT::i16, CWD, DAG.getConstant(0x400, MVT::i16)), DAG.getConstant(9, MVT::i8)); SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i16, DAG.getNode(ISD::ADD, DL, MVT::i16, DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), DAG.getConstant(1, MVT::i16)), DAG.getConstant(3, MVT::i16)); return DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. OpVT = MVT::i32; Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } // Issue a bsr (scan bits in reverse) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); // If src is zero (i.e. bsr sets ZF), returns NumBits. SDValue Ops[] = { Op, DAG.getConstant(NumBits+NumBits-1, OpVT), DAG.getConstant(X86::COND_E, MVT::i8), Op.getValue(1) }; Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); // Finally xor with NumBits-1. Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); if (VT == MVT::i8) Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); return Op; } static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. OpVT = MVT::i32; Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } // Issue a bsr (scan bits in reverse). SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); // And xor with NumBits-1. Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); if (VT == MVT::i8) Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); return Op; } static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); Op = Op.getOperand(0); // Issue a bsf (scan bits forward) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(VT, MVT::i32); Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = { Op, DAG.getConstant(NumBits, VT), DAG.getConstant(X86::COND_E, MVT::i8), Op.getValue(1) }; return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit // ones, and then concatenate the result back. static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is256BitVector() && VT.isInteger() && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); } static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntArith(Op, DAG); SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. if (VT == MVT::v4i32) { assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && "Should not custom lower when pmuldq is available!"); // Extract the odd parts. static const int UnpackMask[] = { 1, -1, 3, -1 }; SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); // Multiply the even parts. SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); // Now multiply odd parts. SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens); Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds); // Merge the two vectors back together with a shuffle. This expands into 2 // shuffles. static const int ShufMask[] = { 0, 4, 2, 6 }; return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); } assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && "Only know how to lower V2I64/V4I64/V8I64 multiply"); // Ahi = psrlqi(a, 32); // Bhi = psrlqi(b, 32); // // AloBlo = pmuludq(a, b); // AloBhi = pmuludq(a, Bhi); // AhiBlo = pmuludq(Ahi, b); // AloBhi = psllqi(AloBhi, 32); // AhiBlo = psllqi(AhiBlo, 32); // return AloBlo + AloBhi + AhiBlo; SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); // Bit cast to 32-bit vectors for MULUDQ EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWin64() && "Unexpected target"); EVT VT = Op.getValueType(); assert(VT.isInteger() && VT.getSizeInBits() == 128 && "Unexpected return type for lowering"); RTLIB::Libcall LC; bool isSigned; switch (Op->getOpcode()) { default: llvm_unreachable("Unexpected request for libcall!"); case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; } SDLoc dl(Op); SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { EVT ArgVT = Op->getOperand(i).getValueType(); assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && "Unexpected argument type for lowering"); SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); Entry.Node = StackPtr; InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), false, false, 16); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Ty = PointerType::get(ArgTy,0); Entry.isSExt = false; Entry.isZExt = false; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy()); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(InChain) .setCallee(getLibcallCallingConv(LC), static_cast(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee, std::move(Args), 0) .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); std::pair CallInfo = LowerCallTo(CLI); return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first); } static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); EVT VT = Op0.getValueType(); SDLoc dl(Op); assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || (VT == MVT::v8i32 && Subtarget->hasInt256())); // PMULxD operations multiply each even value (starting at 0) of LHS with // the related value of RHS and produce a widen result. // E.g., PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> // // In other word, to have all the results, we need to perform two PMULxD: // 1. one with the even values. // 2. one with the odd values. // To achieve #2, with need to place the odd values at an even position. // // Place the odd value at an even position (basically, shift all values 1 // step to the left): const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1}; // => SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); // => SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); // Emit two multiplies, one for the lower 2 ints and one for the higher 2 // ints. MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64; bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; unsigned Opcode = (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); // Shuffle it back into the right order. SDValue Highs, Lows; if (VT == MVT::v8i32) { const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); } else { const int HighMask[] = {1, 5, 3, 7}; Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); const int LowMask[] = {0, 4, 2, 6}; Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); } // If we have a signed multiply but no PMULDQ fix up the high parts of a // unsigned multiply. if (IsSigned && !Subtarget->hasSSE41()) { SDValue ShAmt = DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT)); SDValue T1 = DAG.getNode(ISD::AND, dl, VT, DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); SDValue T2 = DAG.getNode(ISD::AND, dl, VT, DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0); SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); } // The first result of MUL_LOHI is actually the low value, followed by the // high value. SDValue Ops[] = {Lows, Highs}; return DAG.getMergeValues(Ops, dl); } static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); // Optimize shl/srl/sra with constant shift amount. if (auto *BVAmt = dyn_cast(Amt)) { if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { uint64_t ShiftAmt = ShiftConst->getZExtValue(); if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || (Subtarget->hasInt256() && (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) || (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) { if (Op.getOpcode() == ISD::SHL) return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, DAG); if (Op.getOpcode() == ISD::SRL) return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, DAG); } if (VT == MVT::v16i8) { if (Op.getOpcode() == ISD::SHL) { // Make a large shift. SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, R, ShiftAmt, DAG); SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); // Zero out the rightmost bits. SmallVector V(16, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, MVT::v8i16, R, ShiftAmt, DAG); SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); // Zero out the leftmost bits. SmallVector V(16, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRA) { if (ShiftAmt == 7) { // R s>> 7 === R s< 0 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); } // R s>> a === ((R u>> a) ^ m) - m SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SmallVector V(16, DAG.getConstant(128 >> ShiftAmt, MVT::i8)); SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; } llvm_unreachable("Unknown shift opcode."); } if (Subtarget->hasInt256() && VT == MVT::v32i8) { if (Op.getOpcode() == ISD::SHL) { // Make a large shift. SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v16i16, R, ShiftAmt, DAG); SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); // Zero out the rightmost bits. SmallVector V(32, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, MVT::v16i16, R, ShiftAmt, DAG); SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); // Zero out the leftmost bits. SmallVector V(32, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRA) { if (ShiftAmt == 7) { // R s>> 7 === R s< 0 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); } // R s>> a === ((R u>> a) ^ m) - m SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SmallVector V(32, DAG.getConstant(128 >> ShiftAmt, MVT::i8)); SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; } llvm_unreachable("Unknown shift opcode."); } } } // Special case in 32-bit mode, where i64 is expanded into high and low parts. if (!Subtarget->is64Bit() && (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && Amt.getOpcode() == ISD::BITCAST && Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / VT.getVectorNumElements(); unsigned RatioInLog2 = Log2_32_Ceil(Ratio); uint64_t ShiftAmt = 0; for (unsigned i = 0; i != Ratio; ++i) { ConstantSDNode *C = dyn_cast(Amt.getOperand(i)); if (!C) return SDValue(); // 6 == Log2(64) ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); } // Check remaining shift amounts. for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { uint64_t ShAmt = 0; for (unsigned j = 0; j != Ratio; ++j) { ConstantSDNode *C = dyn_cast(Amt.getOperand(i + j)); if (!C) return SDValue(); // 6 == Log2(64) ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); } if (ShAmt != ShiftAmt) return SDValue(); } switch (Op.getOpcode()) { default: llvm_unreachable("Unknown shift opcode!"); case ISD::SHL: return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, DAG); case ISD::SRL: return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); case ISD::SRA: return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, DAG); } } return SDValue(); } static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget* Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) || VT == MVT::v4i32 || VT == MVT::v8i16 || (Subtarget->hasInt256() && ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) || VT == MVT::v8i32 || VT == MVT::v16i16)) || (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) { SDValue BaseShAmt; EVT EltVT = VT.getVectorElementType(); if (BuildVectorSDNode *BV = dyn_cast(Amt)) { // Check if this build_vector node is doing a splat. // If so, then set BaseShAmt equal to the splat value. BaseShAmt = BV->getSplatValue(); if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF) BaseShAmt = SDValue(); } else { if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) Amt = Amt.getOperand(0); ShuffleVectorSDNode *SVN = dyn_cast(Amt); if (SVN && SVN->isSplat()) { unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); SDValue InVec = Amt.getOperand(0); if (InVec.getOpcode() == ISD::BUILD_VECTOR) { assert((SplatIdx < InVec.getValueType().getVectorNumElements()) && "Unexpected shuffle index found!"); BaseShAmt = InVec.getOperand(SplatIdx); } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { if (ConstantSDNode *C = dyn_cast(InVec.getOperand(2))) { if (C->getZExtValue() == SplatIdx) BaseShAmt = InVec.getOperand(1); } } if (!BaseShAmt) // Avoid introducing an extract element from a shuffle. BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec, DAG.getIntPtrConstant(SplatIdx)); } } if (BaseShAmt.getNode()) { assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); else if (EltVT.bitsLT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown shift opcode!"); case ISD::SHL: switch (VT.SimpleTy) { default: return SDValue(); case MVT::v2i64: case MVT::v4i32: case MVT::v8i16: case MVT::v4i64: case MVT::v8i32: case MVT::v16i16: case MVT::v16i32: case MVT::v8i64: return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG); } case ISD::SRA: switch (VT.SimpleTy) { default: return SDValue(); case MVT::v4i32: case MVT::v8i16: case MVT::v8i32: case MVT::v16i16: case MVT::v16i32: case MVT::v8i64: return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG); } case ISD::SRL: switch (VT.SimpleTy) { default: return SDValue(); case MVT::v2i64: case MVT::v4i32: case MVT::v8i16: case MVT::v4i64: case MVT::v8i32: case MVT::v16i16: case MVT::v16i32: case MVT::v8i64: return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG); } } } } // Special case in 32-bit mode, where i64 is expanded into high and low parts. if (!Subtarget->is64Bit() && (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) || (Subtarget->hasAVX512() && VT == MVT::v8i64)) && Amt.getOpcode() == ISD::BITCAST && Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / VT.getVectorNumElements(); std::vector Vals(Ratio); for (unsigned i = 0; i != Ratio; ++i) Vals[i] = Amt.getOperand(i); for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { for (unsigned j = 0; j != Ratio; ++j) if (Vals[j] != Amt.getOperand(i + j)) return SDValue(); } switch (Op.getOpcode()) { default: llvm_unreachable("Unknown shift opcode!"); case ISD::SHL: return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1)); case ISD::SRL: return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1)); case ISD::SRA: return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1)); } } return SDValue(); } static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); SDValue V; assert(VT.isVector() && "Custom lowering only for vector shifts!"); assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!"); V = LowerScalarImmediateShift(Op, DAG, Subtarget); if (V.getNode()) return V; V = LowerScalarVariableShift(Op, DAG, Subtarget); if (V.getNode()) return V; if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64)) return Op; // AVX2 has VPSLLV/VPSRAV/VPSRLV. if (Subtarget->hasInt256()) { if (Op.getOpcode() == ISD::SRL && (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v4i64 || VT == MVT::v8i32)) return Op; if (Op.getOpcode() == ISD::SHL && (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v4i64 || VT == MVT::v8i32)) return Op; if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32)) return Op; } // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. if (Op.getOpcode() == ISD::SHL && (VT == MVT::v8i16 || VT == MVT::v4i32 || (Subtarget->hasInt256() && VT == MVT::v16i16)) && ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { SmallVector Elts; EVT SVT = VT.getScalarType(); unsigned SVTBits = SVT.getSizeInBits(); const APInt &One = APInt(SVTBits, 1); unsigned NumElems = VT.getVectorNumElements(); for (unsigned i=0; i !=NumElems; ++i) { SDValue Op = Amt->getOperand(i); if (Op->getOpcode() == ISD::UNDEF) { Elts.push_back(Op); continue; } ConstantSDNode *ND = cast(Op); const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue()); uint64_t ShAmt = C.getZExtValue(); if (ShAmt >= SVTBits) { Elts.push_back(DAG.getUNDEF(SVT)); continue; } Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT)); } SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); return DAG.getNode(ISD::MUL, dl, VT, R, BV); } // Lower SHL with variable shift amount. if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT)); Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT)); Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); return DAG.getNode(ISD::MUL, dl, VT, Op, R); } // If possible, lower this shift as a sequence of two shifts by // constant plus a MOVSS/MOVSD instead of scalarizing it. // Example: // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) // // Could be rewritten as: // (v4i32 (MOVSS (srl A, ), (srl A, ))) // // The advantage is that the two shifts from the example would be // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing // the vector shift into four scalar shifts plus four pairs of vector // insert/extract. if ((VT == MVT::v8i16 || VT == MVT::v4i32) && ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { unsigned TargetOpcode = X86ISD::MOVSS; bool CanBeSimplified; // The splat value for the first packed shift (the 'X' from the example). SDValue Amt1 = Amt->getOperand(0); // The splat value for the second packed shift (the 'Y' from the example). SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2); // See if it is possible to replace this node with a sequence of // two shifts followed by a MOVSS/MOVSD if (VT == MVT::v4i32) { // Check if it is legal to use a MOVSS. CanBeSimplified = Amt2 == Amt->getOperand(2) && Amt2 == Amt->getOperand(3); if (!CanBeSimplified) { // Otherwise, check if we can still simplify this node using a MOVSD. CanBeSimplified = Amt1 == Amt->getOperand(1) && Amt->getOperand(2) == Amt->getOperand(3); TargetOpcode = X86ISD::MOVSD; Amt2 = Amt->getOperand(2); } } else { // Do similar checks for the case where the machine value type // is MVT::v8i16. CanBeSimplified = Amt1 == Amt->getOperand(1); for (unsigned i=3; i != 8 && CanBeSimplified; ++i) CanBeSimplified = Amt2 == Amt->getOperand(i); if (!CanBeSimplified) { TargetOpcode = X86ISD::MOVSD; CanBeSimplified = true; Amt2 = Amt->getOperand(4); for (unsigned i=0; i != 4 && CanBeSimplified; ++i) CanBeSimplified = Amt1 == Amt->getOperand(i); for (unsigned j=4; j != 8 && CanBeSimplified; ++j) CanBeSimplified = Amt2 == Amt->getOperand(j); } } if (CanBeSimplified && isa(Amt1) && isa(Amt2)) { // Replace this node with two shifts followed by a MOVSS/MOVSD. EVT CastVT = MVT::v4i32; SDValue Splat1 = DAG.getConstant(cast(Amt1)->getAPIntValue(), VT); SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); SDValue Splat2 = DAG.getConstant(cast(Amt2)->getAPIntValue(), VT); SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); if (TargetOpcode == X86ISD::MOVSD) CastVT = MVT::v2i64; SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1); SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2); SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2, BitCast1, DAG); return DAG.getNode(ISD::BITCAST, dl, VT, Result); } } if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); // a = a << 5; Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT)); Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); // Turn 'a' into a mask suitable for VSELECT SDValue VSelM = DAG.getConstant(0x80, VT); SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); SDValue CM1 = DAG.getConstant(0x0f, VT); SDValue CM2 = DAG.getConstant(0x3f, VT); // r = VSELECT(r, psllw(r & (char16)15, 4), a); SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG); M = DAG.getNode(ISD::BITCAST, dl, VT, M); R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); // r = VSELECT(r, psllw(r & (char16)63, 2), a); M = DAG.getNode(ISD::AND, dl, VT, R, CM2); M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG); M = DAG.getNode(ISD::BITCAST, dl, VT, M); R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); // return VSELECT(r, r+r, a); R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, DAG.getNode(ISD::ADD, dl, VT, R, R), R); return R; } // It's worth extending once and using the v8i32 shifts for 16-bit types, but // the extra overheads to get from v16i8 to v8i32 make the existing SSE // solution better. if (Subtarget->hasInt256() && VT == MVT::v8i16) { MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16; unsigned ExtOpc = Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; R = DAG.getNode(ExtOpc, dl, NewVT, R); Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt); return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt)); } // Decompose 256-bit shifts into smaller 128-bit shifts. if (VT.is256BitVector()) { unsigned NumElems = VT.getVectorNumElements(); MVT EltVT = VT.getVectorElementType(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); // Extract the two vectors SDValue V1 = Extract128BitVector(R, 0, DAG, dl); SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); // Recreate the shift amount vectors SDValue Amt1, Amt2; if (Amt.getOpcode() == ISD::BUILD_VECTOR) { // Constant shift amount SmallVector Amt1Csts; SmallVector Amt2Csts; for (unsigned i = 0; i != NumElems/2; ++i) Amt1Csts.push_back(Amt->getOperand(i)); for (unsigned i = NumElems/2; i != NumElems; ++i) Amt2Csts.push_back(Amt->getOperand(i)); Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); } else { // Variable shift amount Amt1 = Extract128BitVector(Amt, 0, DAG, dl); Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); } // Issue new vector shifts for the smaller types V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); // Concatenate the result back return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); } return SDValue(); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. The "brcond" lowering // looks for this combo and may remove the "setcc" instruction if the "setcc" // has only one use. SDNode *N = Op.getNode(); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); unsigned BaseOp = 0; unsigned Cond = 0; SDLoc DL(Op); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown ovf instruction!"); case ISD::SADDO: // A subtract of one will be selected as a INC. Note that INC doesn't // set CF, so we can't do this for UADDO. if (ConstantSDNode *C = dyn_cast(RHS)) if (C->isOne()) { BaseOp = X86ISD::INC; Cond = X86::COND_O; break; } BaseOp = X86ISD::ADD; Cond = X86::COND_O; break; case ISD::UADDO: BaseOp = X86ISD::ADD; Cond = X86::COND_B; break; case ISD::SSUBO: // A subtract of one will be selected as a DEC. Note that DEC doesn't // set CF, so we can't do this for USUBO. if (ConstantSDNode *C = dyn_cast(RHS)) if (C->isOne()) { BaseOp = X86ISD::DEC; Cond = X86::COND_O; break; } BaseOp = X86ISD::SUB; Cond = X86::COND_O; break; case ISD::USUBO: BaseOp = X86ISD::SUB; Cond = X86::COND_B; break; case ISD::SMULO: BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL; Cond = X86::COND_O; break; case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs if (N->getValueType(0) == MVT::i8) { BaseOp = X86ISD::UMUL8; Cond = X86::COND_O; break; } SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), MVT::i32); SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(X86::COND_O, MVT::i32), SDValue(Sum.getNode(), 2)); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } } // Also sets EFLAGS. SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } // Sign extension of the low part of vector elements. This may be used either // when sign extend instructions are not available or if the vector element // sizes already match the sign-extended size. If the vector elements are in // their pre-extended size and sign extend instructions are available, that will // be handled by LowerSIGN_EXTEND. SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT ExtraVT = cast(Op.getOperand(1))->getVT(); MVT VT = Op.getSimpleValueType(); if (!Subtarget->hasSSE2() || !VT.isVector()) return SDValue(); unsigned BitsDiff = VT.getScalarType().getSizeInBits() - ExtraVT.getScalarType().getSizeInBits(); switch (VT.SimpleTy) { default: return SDValue(); case MVT::v8i32: case MVT::v16i16: if (!Subtarget->hasFp256()) return SDValue(); if (!Subtarget->hasInt256()) { // needs to be split unsigned NumElems = VT.getVectorNumElements(); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); MVT EltVT = VT.getVectorElementType(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); EVT ExtraEltVT = ExtraVT.getVectorElementType(); unsigned ExtraNumElems = ExtraVT.getVectorNumElements(); ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, ExtraNumElems/2); SDValue Extra = DAG.getValueType(ExtraVT); LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); } // fall through case MVT::v4i32: case MVT::v8i16: { SDValue Op0 = Op.getOperand(0); // This is a sign extension of some low part of vector elements without // changing the size of the vector elements themselves: // Shift-Left + Shift-Right-Algebraic. SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff, DAG); return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff, DAG); } } } /// Returns true if the operand type is exactly twice the native width, and /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. /// Used to know whether to use cmpxchg8/16b when expanding atomic operations /// (otherwise we leave them alone to become __sync_fetch_and_... calls). bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { const X86Subtarget &Subtarget = getTargetMachine().getSubtarget(); unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b else if (OpWidth == 128) return Subtarget.hasCmpxchg16b(); else return false; } bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { return needsCmpXchgNb(SI->getValueOperand()->getType()); } // Note: this turns large loads into lock cmpxchg8b/16b. // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { auto PTy = cast(LI->getPointerOperand()->getType()); return needsCmpXchgNb(PTy->getElementType()); } bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { const X86Subtarget &Subtarget = getTargetMachine().getSubtarget(); unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; const Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available // and default to library calls otherwise. if (MemType->getPrimitiveSizeInBits() > NativeWidth) return needsCmpXchgNb(MemType); AtomicRMWInst::BinOp Op = AI->getOperation(); switch (Op) { default: llvm_unreachable("Unknown atomic operation"); case AtomicRMWInst::Xchg: case AtomicRMWInst::Add: case AtomicRMWInst::Sub: // It's better to use xadd, xsub or xchg for these in all cases. return false; case AtomicRMWInst::Or: case AtomicRMWInst::And: case AtomicRMWInst::Xor: // If the atomicrmw's result isn't actually used, we can just add a "lock" // prefix to a normal instruction for these operations. return !AI->use_empty(); case AtomicRMWInst::Nand: case AtomicRMWInst::Max: case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. return true; } } static bool hasMFENCE(const X86Subtarget& Subtarget) { // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for // no-sse2). There isn't any reason to disable it if the target processor // supports it. return Subtarget.hasSSE2() || Subtarget.is64Bit(); } LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { const X86Subtarget &Subtarget = getTargetMachine().getSubtarget(); unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; const Type *MemType = AI->getType(); // Accesses larger than the native width are turned into cmpxchg/libcalls, so // there is no benefit in turning such RMWs into loads, and it is actually // harmful as it introduces a mfence. if (MemType->getPrimitiveSizeInBits() > NativeWidth) return nullptr; auto Builder = IRBuilder<>(AI); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); auto SynchScope = AI->getSynchScope(); // We must restrict the ordering to avoid generating loads with Release or // ReleaseAcquire orderings. auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); auto Ptr = AI->getPointerOperand(); // Before the load we need a fence. Here is an example lifted from // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence // is required: // Thread 0: // x.store(1, relaxed); // r1 = y.fetch_add(0, release); // Thread 1: // y.fetch_add(42, acquire); // r2 = x.load(relaxed); // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is // lowered to just a load without a fence. A mfence flushes the store buffer, // making the optimization clearly correct. // FIXME: it is required if isAtLeastRelease(Order) but it is not clear // otherwise, we might be able to be more agressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. if (SynchScope == SingleThread) { // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at // the IR level, so we must wrap it in an intrinsic. return nullptr; } else if (hasMFENCE(Subtarget)) { Function *MFence = llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); Builder.CreateCall(MFence); } else { // FIXME: it might make sense to use a locked operation here but on a // different cache-line to prevent cache-line bouncing. In practice it // is probably a small win, and x86 processors without mfence are rare // enough that we do not bother. return nullptr; } // Finally we can emit the atomic load. LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, AI->getType()->getPrimitiveSizeInBits()); Loaded->setAtomic(Order, SynchScope); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); return Loaded; } static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); AtomicOrdering FenceOrdering = static_cast( cast(Op.getOperand(1))->getZExtValue()); SynchronizationScope FenceScope = static_cast( cast(Op.getOperand(2))->getZExtValue()); // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { if (hasMFENCE(*Subtarget)) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); SDValue Zero = DAG.getConstant(0, MVT::i32); SDValue Ops[] = { DAG.getRegister(X86::ESP, MVT::i32), // Base DAG.getTargetConstant(1, MVT::i8), // Scale DAG.getRegister(0, MVT::i32), // Index DAG.getTargetConstant(0, MVT::i32), // Disp DAG.getRegister(0, MVT::i32), // Segment. Zero, Chain }; SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops); return SDValue(Res, 0); } // MEMBARRIER is a compiler barrier; it codegens to a no-op. return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); } static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT T = Op.getSimpleValueType(); SDLoc DL(Op); unsigned Reg = 0; unsigned size = 0; switch(T.SimpleTy) { default: llvm_unreachable("Invalid value type!"); case MVT::i8: Reg = X86::AL; size = 1; break; case MVT::i16: Reg = X86::AX; size = 2; break; case MVT::i32: Reg = X86::EAX; size = 4; break; case MVT::i64: assert(Subtarget->is64Bit() && "Node not type legal!"); Reg = X86::RAX; size = 8; break; } SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, Op.getOperand(2), SDValue()); SDValue Ops[] = { cpIn.getValue(0), Op.getOperand(1), Op.getOperand(3), DAG.getTargetConstant(size, MVT::i8), cpIn.getValue(1) }; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO); SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, MVT::i32, cpOut.getValue(2)); SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1), DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); return SDValue(); } static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); if (DstVT != MVT::f64) // This conversion needs to be expanded. return SDValue(); SDValue InVec = Op->getOperand(0); SDLoc dl(Op); unsigned NumElts = SrcVT.getVectorNumElements(); EVT SVT = SrcVT.getVectorElementType(); // Widen the vector in input in the case of MVT::v2i32. // Example: from MVT::v2i32 to MVT::v4i32. SmallVector Elts; for (unsigned i = 0, e = NumElts; i != e; ++i) Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, DAG.getIntPtrConstant(i))); // Explicitly mark the extra elements as Undef. SDValue Undef = DAG.getUNDEF(SVT); for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i) Elts.push_back(Undef); EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts); SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, DAG.getIntPtrConstant(0)); } assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && Subtarget->hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || (DstVT.isVector() && DstVT.getSizeInBits()==64)) && "Unexpected custom BITCAST"); // i64 <=> MMX conversions are Legal. if (SrcVT==MVT::i64 && DstVT.isVector()) return Op; if (DstVT==MVT::i64 && SrcVT.isVector()) return Op; // MMX <=> MMX conversions are Legal. if (SrcVT.isVector() && DstVT.isVector()) return Op; // All other conversions need to be expanded. return SDValue(); } static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); SDLoc dl(Node); Op = Op.getOperand(0); EVT VT = Op.getValueType(); assert((VT.is128BitVector() || VT.is256BitVector()) && "CTPOP lowering only implemented for 128/256-bit wide vector types"); unsigned NumElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); unsigned Len = EltVT.getSizeInBits(); // This is the vectorized version of the "best" algorithm from // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel // with a minor tweak to use a series of adds + shifts instead of vector // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types: // // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled // v8i32 => Always profitable // // FIXME: There a couple of possible improvements: // // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled). // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html // assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 && "CTPOP not implemented for this vector element type."); // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid // extra legalization. bool NeedsBitcast = EltVT == MVT::i32; MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64; SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT); SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT); SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT); // v = v - ((v >> 1) & 0x55555555...) SmallVector Ones(NumElts, DAG.getConstant(1, EltVT)); SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones); SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV); if (NeedsBitcast) Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); SmallVector Mask55(NumElts, Cst55); SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55); if (NeedsBitcast) M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55); SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55); if (VT != And.getValueType()) And = DAG.getNode(ISD::BITCAST, dl, VT, And); SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And); // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) SmallVector Mask33(NumElts, Cst33); SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33); SmallVector Twos(NumElts, DAG.getConstant(2, EltVT)); SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos); Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV); if (NeedsBitcast) { Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33); Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub); } SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33); SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33); if (VT != AndRHS.getValueType()) { AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS); AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS); } SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS); // v = (v + (v >> 4)) & 0x0F0F0F0F... SmallVector Fours(NumElts, DAG.getConstant(4, EltVT)); SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours); Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV); Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); SmallVector Mask0F(NumElts, Cst0F); SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F); if (NeedsBitcast) { Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F); } And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F); if (VT != And.getValueType()) And = DAG.getNode(ISD::BITCAST, dl, VT, And); // The algorithm mentioned above uses: // v = (v * 0x01010101...) >> (Len - 8) // // Change it to use vector adds + vector shifts which yield faster results on // Haswell than using vector integer multiplication. // // For i32 elements: // v = v + (v >> 8) // v = v + (v >> 16) // // For i64 elements: // v = v + (v >> 8) // v = v + (v >> 16) // v = v + (v >> 32) // Add = And; SmallVector Csts; for (unsigned i = 8; i <= Len/2; i *= 2) { Csts.assign(NumElts, DAG.getConstant(i, EltVT)); SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts); Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV); Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); Csts.clear(); } // The result is on the least significant 6-bits on i32 and 7-bits on i64. SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT); SmallVector Cst3FV(NumElts, Cst3F); SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV); if (NeedsBitcast) { Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F); } And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F); if (VT != And.getValueType()) And = DAG.getNode(ISD::BITCAST, dl, VT, And); return And; } static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); SDLoc dl(Node); EVT T = Node->getValueType(0); SDValue negOp = DAG.getNode(ISD::SUB, dl, T, DAG.getConstant(0, T), Node->getOperand(2)); return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, cast(Node)->getMemoryVT(), Node->getOperand(0), Node->getOperand(1), negOp, cast(Node)->getMemOperand(), cast(Node)->getOrdering(), cast(Node)->getSynchScope()); } static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); SDLoc dl(Node); EVT VT = cast(Node)->getMemoryVT(); // Convert seq_cst store -> xchg // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) // FIXME: On 32-bit, store -> fist or movq would be more efficient // (The only way to get a 16-byte store is cmpxchg16b) // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. if (cast(Node)->getOrdering() == SequentiallyConsistent || !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, cast(Node)->getMemoryVT(), Node->getOperand(0), Node->getOperand(1), Node->getOperand(2), cast(Node)->getMemOperand(), cast(Node)->getOrdering(), cast(Node)->getSynchScope()); return Swap.getValue(1); } // Other atomic stores have a simple pattern. return Op; } static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getNode()->getSimpleValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); SDVTList VTs = DAG.getVTList(VT, MVT::i32); unsigned Opc; bool ExtraOp = false; switch (Op.getOpcode()) { default: llvm_unreachable("Invalid code"); case ISD::ADDC: Opc = X86ISD::ADD; break; case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; case ISD::SUBC: Opc = X86ISD::SUB; break; case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; } if (!ExtraOp) return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); } static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); // For MacOSX, we want to call an alternative entry point: __sincos_stret, // which returns the values as { float, float } (in XMM0) or // { double, double } (which is returned in XMM0, XMM1). SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.isSExt = false; Entry.isZExt = false; Args.push_back(Entry); bool isF64 = ArgVT == MVT::f64; // Only optimize x86_64 for now. i386 is a bit messy. For f32, // the small struct {f32, f32} is returned in (eax, edx). For f64, // the results are returned via SRet in memory. const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy()); Type *RetTy = isF64 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr) : (Type*)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0); std::pair CallResult = TLI.LowerCallTo(CLI); if (isF64) // Returned in xmm0 and xmm1. return CallResult.first; // Returned in bits 0:31 and 32:64 xmm0. SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, DAG.getIntPtrConstant(0)); SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, DAG.getIntPtrConstant(1)); SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::SHL_PARTS: case ISD::SRA_PARTS: case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); case ISD::FABS: case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::FRAME_TO_ARGS_OFFSET: return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::CTLZ: return LowerCTLZ(Op, DAG); case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); case ISD::ADDC: case ISD::ADDE: case ISD::SUBC: case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); } } /// ReplaceNodeResults - Replace a node with an illegal result type /// with a new node built out of custom code. void X86TargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, SelectionDAG &DAG) const { SDLoc dl(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. case X86ISD::FMINC: case X86ISD::FMIN: case X86ISD::FMAXC: case X86ISD::FMAX: { EVT VT = N->getValueType(0); if (VT != MVT::v2f32) llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX."); SDValue UNDEF = DAG.getUNDEF(VT); SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(0), UNDEF); SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(1), UNDEF); Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); return; } case ISD::SIGN_EXTEND_INREG: case ISD::ADDC: case ISD::ADDE: case ISD::SUBC: case ISD::SUBE: // We don't want to expand or promote these. return; case ISD::SDIV: case ISD::UDIV: case ISD::SREM: case ISD::UREM: case ISD::SDIVREM: case ISD::UDIVREM: { SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); Results.push_back(V); return; } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) return; std::pair Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; if (FIST.getNode()) { EVT VT = N->getValueType(0); // Return a load from the stack slot. if (StackSlot.getNode()) Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo(), false, false, false, 0)); else Results.push_back(FIST); } return; } case ISD::UINT_TO_FP: { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); if (N->getOperand(0).getValueType() != MVT::v2i32 || N->getValueType(0) != MVT::v2f32) return; SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N->getOperand(0)); SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), MVT::f64); SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias)); Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or); SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); return; } case ISD::FP_ROUND: { if (!TLI.isTypeLegal(N->getOperand(0).getValueType())) return; SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); Results.push_back(V); return; } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default : llvm_unreachable("Do not know how to custom type " "legalize this intrinsic operation!"); case Intrinsic::x86_rdtsc: return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); case Intrinsic::x86_rdtscp: return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, Results); case Intrinsic::x86_rdpmc: return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); } } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; SDValue cpInL, cpInH; cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), DAG.getConstant(0, HalfT)); cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), DAG.getConstant(1, HalfT)); cpInL = DAG.getCopyToReg(N->getOperand(0), dl, Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue()); cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX, cpInH, cpInL.getValue(1)); SDValue swapInL, swapInH; swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), DAG.getConstant(0, HalfT)); swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), DAG.getConstant(1, HalfT)); swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RBX : X86::EBX, swapInL, cpInH.getValue(1)); swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, swapInH, swapInL.getValue(1)); SDValue Ops[] = { swapInH.getValue(0), N->getOperand(1), swapInH.getValue(1) }; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast(N)->getMemOperand(); unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, Regs64bit ? X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, MVT::i32, cpOutH.getValue(2)); SDValue Success = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); Results.push_back(Success); Results.push_back(EFLAGS.getValue(1)); return; } case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case ISD::ATOMIC_LOAD: { // Delegate to generic TypeLegalization. Situations we can really handle // should have already been dealt with by AtomicExpandPass.cpp. break; } case ISD::BITCAST: { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); EVT DstVT = N->getValueType(0); EVT SrcVT = N->getOperand(0)->getValueType(0); if (SrcVT != MVT::f64 || (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) return; unsigned NumElts = DstVT.getVectorNumElements(); EVT SVT = DstVT.getVectorElementType(); EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0)); SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded); if (ExperimentalVectorWideningLegalization) { // If we are legalizing vectors by widening, we already have the desired // legal vector type, just return it. Results.push_back(ToVecInt); return; } SmallVector Elts; for (unsigned i = 0, e = NumElts; i != e; ++i) Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, ToVecInt, DAG.getIntPtrConstant(i))); Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts)); } } } const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { default: return nullptr; case X86ISD::BSF: return "X86ISD::BSF"; case X86ISD::BSR: return "X86ISD::BSR"; case X86ISD::SHLD: return "X86ISD::SHLD"; case X86ISD::SHRD: return "X86ISD::SHRD"; case X86ISD::FAND: return "X86ISD::FAND"; case X86ISD::FANDN: return "X86ISD::FANDN"; case X86ISD::FOR: return "X86ISD::FOR"; case X86ISD::FXOR: return "X86ISD::FXOR"; case X86ISD::FSRL: return "X86ISD::FSRL"; case X86ISD::FILD: return "X86ISD::FILD"; case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; case X86ISD::FLD: return "X86ISD::FLD"; case X86ISD::FST: return "X86ISD::FST"; case X86ISD::CALL: return "X86ISD::CALL"; case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; case X86ISD::UCOMI: return "X86ISD::UCOMI"; case X86ISD::CMPM: return "X86ISD::CMPM"; case X86ISD::CMPMU: return "X86ISD::CMPMU"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; case X86ISD::FSETCC: return "X86ISD::FSETCC"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; case X86ISD::Wrapper: return "X86ISD::Wrapper"; case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::PSIGN: return "X86ISD::PSIGN"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; case X86ISD::SUBUS: return "X86ISD::SUBUS"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::UMAX: return "X86ISD::UMAX"; case X86ISD::UMIN: return "X86ISD::UMIN"; case X86ISD::SMAX: return "X86ISD::SMAX"; case X86ISD::SMIN: return "X86ISD::SMIN"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FMAXC: return "X86ISD::FMAXC"; case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; case X86ISD::VZEXT: return "X86ISD::VZEXT"; case X86ISD::VSEXT: return "X86ISD::VSEXT"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM"; case X86ISD::VINSERT: return "X86ISD::VINSERT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; case X86ISD::VSRL: return "X86ISD::VSRL"; case X86ISD::VSRA: return "X86ISD::VSRA"; case X86ISD::VSHLI: return "X86ISD::VSHLI"; case X86ISD::VSRLI: return "X86ISD::VSRLI"; case X86ISD::VSRAI: return "X86ISD::VSRAI"; case X86ISD::CMPP: return "X86ISD::CMPP"; case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM"; case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM"; case X86ISD::ADD: return "X86ISD::ADD"; case X86ISD::SUB: return "X86ISD::SUB"; case X86ISD::ADC: return "X86ISD::ADC"; case X86ISD::SBB: return "X86ISD::SBB"; case X86ISD::SMUL: return "X86ISD::SMUL"; case X86ISD::UMUL: return "X86ISD::UMUL"; case X86ISD::SMUL8: return "X86ISD::SMUL8"; case X86ISD::UMUL8: return "X86ISD::UMUL8"; case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG"; case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG"; case X86ISD::INC: return "X86ISD::INC"; case X86ISD::DEC: return "X86ISD::DEC"; case X86ISD::OR: return "X86ISD::OR"; case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; case X86ISD::BEXTR: return "X86ISD::BEXTR"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; case X86ISD::TESTM: return "X86ISD::TESTM"; case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::VALIGN: return "X86ISD::VALIGN"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; case X86ISD::SHUFP: return "X86ISD::SHUFP"; case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; case X86ISD::MOVSD: return "X86ISD::MOVSD"; case X86ISD::MOVSS: return "X86ISD::MOVSS"; case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; case X86ISD::FMADD: return "X86ISD::FMADD"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; case X86ISD::EXPAND: return "X86ISD::EXPAND"; case X86ISD::SELECT: return "X86ISD::SELECT"; } } // isLegalAddressingMode - Return true if the addressing mode represented // by AM is legal for this target, for a load/store of the specified type. bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, Type *Ty) const { // X86 supports extremely general addressing modes. CodeModel::Model M = getTargetMachine().getCodeModel(); Reloc::Model R = getTargetMachine().getRelocationModel(); // X86 allows a sign-extended 32-bit immediate field as a displacement. if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) return false; if (AM.BaseGV) { unsigned GVFlags = Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); // If a reference to this global requires an extra load, we can't fold it. if (isGlobalStubReference(GVFlags)) return false; // If BaseGV requires a register for the PIC base, we cannot also have a // BaseReg specified. if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) return false; // If lower 4G is not available, then we must use rip-relative addressing. if ((M != CodeModel::Small || R != Reloc::Static) && Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) return false; } switch (AM.Scale) { case 0: case 1: case 2: case 4: case 8: // These scales always work. break; case 3: case 5: case 9: // These scales are formed with basereg+scalereg. Only accept if there is // no basereg yet. if (AM.HasBaseReg) return false; break; default: // Other stuff never works. return false; } return true; } bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { unsigned Bits = Ty->getScalarSizeInBits(); // 8-bit shifts are always expensive, but versions with a scalar amount aren't // particularly cheaper than those without. if (Bits == 8) return false; // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make // variable shifts just as cheap as scalar ones. if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64)) return false; // Otherwise, it's significantly cheaper to shift by a scalar amount than by a // fully general vector. return true; } bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 > NumBits2; } bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; if (!isTypeLegal(EVT::getEVT(Ty1))) return false; assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); // Assuming the caller doesn't have a zeroext or signext return parameter, // truncation all the way down to i1 is valid. return true; } bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<32>(Imm); } bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { // Can also use sub to handle negated immediates. return isInt<32>(Imm); } bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (!VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 > NumBits2; } bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); } bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); } bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { EVT VT1 = Val.getValueType(); if (isZExtFree(VT1, VT2)) return true; if (Val.getOpcode() != ISD::LOAD) return false; if (!VT1.isSimple() || !VT1.isInteger() || !VT2.isSimple() || !VT2.isInteger()) return false; switch (VT1.getSimpleVT().SimpleTy) { default: break; case MVT::i8: case MVT::i16: case MVT::i32: // X86 has 8, 16, and 32-bit zero-extending loads. return true; } return false; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { if (!(Subtarget->hasFMA() || Subtarget->hasFMA4())) return false; VT = VT.getScalarType(); if (!VT.isSimple()) return false; switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: case MVT::f64: return true; default: break; } return false; } bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { // i16 instructions are longer (0x66 prefix) and potentially slower. return !(VT1 == MVT::i32 && VT2 == MVT::i16); } /// isShuffleMaskLegal - Targets can use this to indicate that they only /// support *some* VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values /// are assumed to be legal. bool X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const { if (!VT.isSimple()) return false; MVT SVT = VT.getSimpleVT(); // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSizeInBits() == 64) return false; // This is an experimental legality test that is tailored to match the // legality test of the experimental lowering more closely. They are gated // separately to ease testing of performance differences. if (ExperimentalVectorShuffleLegality) // We only care that the types being shuffled are legal. The lowering can // handle any possible shuffle mask that results. return isTypeLegal(SVT); // If this is a single-input shuffle with no 128 bit lane crossings we can // lower it into pshufb. if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) || (SVT.is256BitVector() && Subtarget->hasInt256())) { bool isLegal = true; for (unsigned I = 0, E = M.size(); I != E; ++I) { if (M[I] >= (int)SVT.getVectorNumElements() || ShuffleCrosses128bitLane(SVT, I, M[I])) { isLegal = false; break; } } if (isLegal) return true; } // FIXME: blends, shifts. return (SVT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isMOVLMask(M, SVT) || isCommutedMOVLMask(M, SVT) || isMOVHLPSMask(M, SVT) || isSHUFPMask(M, SVT) || isSHUFPMask(M, SVT, /* Commuted */ true) || isPSHUFDMask(M, SVT) || isPSHUFDMask(M, SVT, /* SecondOperand */ true) || isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) || isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) || isPALIGNRMask(M, SVT, Subtarget) || isUNPCKLMask(M, SVT, Subtarget->hasInt256()) || isUNPCKHMask(M, SVT, Subtarget->hasInt256()) || isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) || (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT))); } bool X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl &Mask, EVT VT) const { if (!VT.isSimple()) return false; MVT SVT = VT.getSimpleVT(); // This is an experimental legality test that is tailored to match the // legality test of the experimental lowering more closely. They are gated // separately to ease testing of performance differences. if (ExperimentalVectorShuffleLegality) // The new vector shuffle lowering is very good at managing zero-inputs. return isShuffleMaskLegal(Mask, VT); unsigned NumElts = SVT.getVectorNumElements(); // FIXME: This collection of masks seems suspect. if (NumElts == 2) return true; if (NumElts == 4 && SVT.is128BitVector()) { return (isMOVLMask(Mask, SVT) || isCommutedMOVLMask(Mask, SVT, true) || isSHUFPMask(Mask, SVT) || isSHUFPMask(Mask, SVT, /* Commuted */ true) || isBlendMask(Mask, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256())); } return false; } //===----------------------------------------------------------------------===// // X86 Scheduler Hooks //===----------------------------------------------------------------------===// /// Utility function to emit xbegin specifying the start of an RTM region. static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII) { DebugLoc DL = MI->getDebugLoc(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = MBB; ++I; // For the v = xbegin(), we generate // // thisMBB: // xbegin sinkMBB // // mainMBB: // eax = -1 // // sinkMBB: // v = eax MachineBasicBlock *thisMBB = MBB; MachineFunction *MF = MBB->getParent(); MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: // xbegin sinkMBB // # fallthrough to mainMBB // # abortion to sinkMBB BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(sinkMBB); // mainMBB: // EAX = -1 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); mainMBB->addSuccessor(sinkMBB); // sinkMBB: // EAX is live into the sinkMBB sinkMBB->addLiveIn(X86::EAX); BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) .addReg(X86::EAX); MI->eraseFromParent(); return sinkMBB; } // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 // or XMM0_V32I8 in AVX all of this code can be replaced with that // in the .td file. static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, const TargetInstrInfo *TII) { unsigned Opc; switch (MI->getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; } DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); unsigned NumArgs = MI->getNumOperands(); for (unsigned i = 1; i < NumArgs; ++i) { MachineOperand &Op = MI->getOperand(i); if (!(Op.isReg() && Op.isImplicit())) MIB.addOperand(Op); } if (MI->hasOneMemOperand()) MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) .addReg(X86::XMM0); MI->eraseFromParent(); return BB; } // FIXME: Custom handling because TableGen doesn't support multiple implicit // defs in an instruction pattern static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, const TargetInstrInfo *TII) { unsigned Opc; switch (MI->getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; } DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); unsigned NumArgs = MI->getNumOperands(); // remove the results for (unsigned i = 1; i < NumArgs; ++i) { MachineOperand &Op = MI->getOperand(i); if (!(Op.isReg() && Op.isImplicit())) MIB.addOperand(Op); } if (MI->hasOneMemOperand()) MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) .addReg(X86::ECX); MI->eraseFromParent(); return BB; } static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, const TargetInstrInfo *TII, const X86Subtarget* Subtarget) { DebugLoc dl = MI->getDebugLoc(); // Address into RAX/EAX, other two args into ECX, EDX. unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); for (int i = 0; i < X86::AddrNumOperands; ++i) MIB.addOperand(MI->getOperand(i)); unsigned ValOps = X86::AddrNumOperands; BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) .addReg(MI->getOperand(ValOps).getReg()); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) .addReg(MI->getOperand(ValOps+1).getReg()); // The instruction doesn't actually take any operands though. BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); MI->eraseFromParent(); // The pseudo is gone now. return BB; } MachineBasicBlock * X86TargetLowering::EmitVAARG64WithCustomInserter( MachineInstr *MI, MachineBasicBlock *MBB) const { // Emit va_arg instruction on X86-64. // Operands to this pseudo-instruction: // 0 ) Output : destination address (reg) // 1-5) Input : va_list address (addr, i64mem) // 6 ) ArgSize : Size (in bytes) of vararg type // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset // 8 ) Align : Alignment of type // 9 ) EFLAGS (implicit-def) assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); unsigned DestReg = MI->getOperand(0).getReg(); MachineOperand &Base = MI->getOperand(1); MachineOperand &Scale = MI->getOperand(2); MachineOperand &Index = MI->getOperand(3); MachineOperand &Disp = MI->getOperand(4); MachineOperand &Segment = MI->getOperand(5); unsigned ArgSize = MI->getOperand(6).getImm(); unsigned ArgMode = MI->getOperand(7).getImm(); unsigned Align = MI->getOperand(8).getImm(); // Memory Reference assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); // Machine Information const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); DebugLoc DL = MI->getDebugLoc(); // struct va_list { // i32 gp_offset // i32 fp_offset // i64 overflow_area (address) // i64 reg_save_area (address) // } // sizeof(va_list) = 24 // alignment(va_list) = 8 unsigned TotalNumIntRegs = 6; unsigned TotalNumXMMRegs = 8; bool UseGPOffset = (ArgMode == 1); bool UseFPOffset = (ArgMode == 2); unsigned MaxOffset = TotalNumIntRegs * 8 + (UseFPOffset ? TotalNumXMMRegs * 16 : 0); /* Align ArgSize to a multiple of 8 */ unsigned ArgSizeA8 = (ArgSize + 7) & ~7; bool NeedsAlign = (Align > 8); MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *overflowMBB; MachineBasicBlock *offsetMBB; MachineBasicBlock *endMBB; unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB unsigned OffsetReg = 0; if (!UseGPOffset && !UseFPOffset) { // If we only pull from the overflow region, we don't create a branch. // We don't need to alter control flow. OffsetDestReg = 0; // unused OverflowDestReg = DestReg; offsetMBB = nullptr; overflowMBB = thisMBB; endMBB = thisMBB; } else { // First emit code to check if gp_offset (or fp_offset) is below the bound. // If so, pull the argument from reg_save_area. (branch to offsetMBB) // If not, pull from overflow_area. (branch to overflowMBB) // // thisMBB // | . // | . // offsetMBB overflowMBB // | . // | . // endMBB // Registers for the PHI in endMBB OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *MF = MBB->getParent(); overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); endMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator MBBIter = MBB; ++MBBIter; // Insert the new basic blocks MF->insert(MBBIter, offsetMBB); MF->insert(MBBIter, overflowMBB); MF->insert(MBBIter, endMBB); // Transfer the remainder of MBB and its successor edges to endMBB. endMBB->splice(endMBB->begin(), thisMBB, std::next(MachineBasicBlock::iterator(MI)), thisMBB->end()); endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); // Make offsetMBB and overflowMBB successors of thisMBB thisMBB->addSuccessor(offsetMBB); thisMBB->addSuccessor(overflowMBB); // endMBB is a successor of both offsetMBB and overflowMBB offsetMBB->addSuccessor(endMBB); overflowMBB->addSuccessor(endMBB); // Load the offset value into a register OffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) .addOperand(Base) .addOperand(Scale) .addOperand(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .addOperand(Segment) .setMemRefs(MMOBegin, MMOEnd); // Check if there is enough room left to pull this argument. BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) .addReg(OffsetReg) .addImm(MaxOffset + 8 - ArgSizeA8); // Branch to "overflowMBB" if offset >= max // Fall through to "offsetMBB" otherwise BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) .addMBB(overflowMBB); } // In offsetMBB, emit code to use the reg_save_area. if (offsetMBB) { assert(OffsetReg != 0); // Read the reg_save_area address. unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) .addOperand(Base) .addOperand(Scale) .addOperand(Index) .addDisp(Disp, 16) .addOperand(Segment) .setMemRefs(MMOBegin, MMOEnd); // Zero-extend the offset unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) .addImm(0) .addReg(OffsetReg) .addImm(X86::sub_32bit); // Add the offset to the reg_save_area to get the final address. BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) .addReg(OffsetReg64) .addReg(RegSaveReg); // Compute the offset for the next argument unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) .addReg(OffsetReg) .addImm(UseFPOffset ? 16 : 8); // Store it back into the va_list. BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) .addOperand(Base) .addOperand(Scale) .addOperand(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .addOperand(Segment) .addReg(NextOffsetReg) .setMemRefs(MMOBegin, MMOEnd); // Jump to endMBB BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) .addMBB(endMBB); } // // Emit code to use overflow area // // Load the overflow_area address into a register. unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) .addOperand(Base) .addOperand(Scale) .addOperand(Index) .addDisp(Disp, 8) .addOperand(Segment) .setMemRefs(MMOBegin, MMOEnd); // If we need to align it, do so. Otherwise, just copy the address // to OverflowDestReg. if (NeedsAlign) { // Align the overflow address assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); // aligned_addr = (addr + (align-1)) & ~(align-1) BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) .addReg(OverflowAddrReg) .addImm(Align-1); BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) .addReg(TmpReg) .addImm(~(uint64_t)(Align-1)); } else { BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) .addReg(OverflowAddrReg); } // Compute the next overflow address after this argument. // (the overflow address should be kept 8-byte aligned) unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) .addReg(OverflowDestReg) .addImm(ArgSizeA8); // Store the new overflow address. BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) .addOperand(Base) .addOperand(Scale) .addOperand(Index) .addDisp(Disp, 8) .addOperand(Segment) .addReg(NextAddrReg) .setMemRefs(MMOBegin, MMOEnd); // If we branched, emit the PHI to the front of endMBB. if (offsetMBB) { BuildMI(*endMBB, endMBB->begin(), DL, TII->get(X86::PHI), DestReg) .addReg(OffsetDestReg).addMBB(offsetMBB) .addReg(OverflowDestReg).addMBB(overflowMBB); } // Erase the pseudo instruction MI->eraseFromParent(); return endMBB; } MachineBasicBlock * X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( MachineInstr *MI, MachineBasicBlock *MBB) const { // Emit code to save XMM registers to the stack. The ABI says that the // number of registers to save is given in %al, so it's theoretically // possible to do an indirect jump trick to avoid saving all of them, // however this code takes a simpler approach and just executes all // of the stores if %al is non-zero. It's less code, and it's probably // easier on the hardware branch predictor, and stores aren't all that // expensive anyway. // Create the new basic blocks. One block contains all the XMM stores, // and one block is the final destination regardless of whether any // stores were performed. const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *F = MBB->getParent(); MachineFunction::iterator MBBIter = MBB; ++MBBIter; MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(MBBIter, XMMSaveMBB); F->insert(MBBIter, EndMBB); // Transfer the remainder of MBB and its successor edges to EndMBB. EndMBB->splice(EndMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); EndMBB->transferSuccessorsAndUpdatePHIs(MBB); // The original block will now fall through to the XMM save block. MBB->addSuccessor(XMMSaveMBB); // The XMMSaveMBB will fall through to the end block. XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned CountReg = MI->getOperand(0).getReg(); int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); if (!Subtarget->isTargetWin64()) { // If %al is 0, branch around the XMM save block. BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); MBB->addSuccessor(EndMBB); } // Make sure the last operand is EFLAGS, which gets clobbered by the branch // that was just emitted, but clearly shouldn't be "saved". assert((MI->getNumOperands() <= 3 || !MI->getOperand(MI->getNumOperands() - 1).isReg() || MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS) && "Expected last argument to be EFLAGS"); unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; // In the XMM save block, save all the XMM argument registers. for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) { int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; MachineMemOperand *MMO = F->getMachineMemOperand( MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), MachineMemOperand::MOStore, /*Size=*/16, /*Align=*/16); BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) .addFrameIndex(RegSaveFrameIndex) .addImm(/*Scale=*/1) .addReg(/*IndexReg=*/0) .addImm(/*Disp=*/Offset) .addReg(/*Segment=*/0) .addReg(MI->getOperand(i).getReg()) .addMemOperand(MMO); } MI->eraseFromParent(); // The pseudo instruction is gone now. return EndMBB; } // The EFLAGS operand of SelectItr might be missing a kill marker // because there were multiple uses of EFLAGS, and ISel didn't know // which to mark. Figure out whether SelectItr should have had a // kill marker, and set it if it should. Returns the correct kill // marker value. static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock* BB, const TargetRegisterInfo* TRI) { // Scan forward through BB for a use/def of EFLAGS. MachineBasicBlock::iterator miI(std::next(SelectItr)); for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { const MachineInstr& mi = *miI; if (mi.readsRegister(X86::EFLAGS)) return false; if (mi.definesRegister(X86::EFLAGS)) break; // Should have kill-flag - update below. } // If we hit the end of the block, check whether EFLAGS is live into a // successor. if (miI == BB->end()) { for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), sEnd = BB->succ_end(); sItr != sEnd; ++sItr) { MachineBasicBlock* succ = *sItr; if (succ->isLiveIn(X86::EFLAGS)) return false; } } // We found a def, or hit the end of the basic block and EFLAGS wasn't live // out. SelectMI should have a kill flag on EFLAGS. SelectItr->addRegisterKilled(X86::EFLAGS, TRI); return true; } MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. The incoming instruction knows the // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = BB; ++It; // thisMBB: // ... // TrueVal = ... // cmpTY ccX, r1, r2 // bCC copy1MBB // fallthrough --> copy0MBB MachineBasicBlock *thisMBB = BB; MachineFunction *F = BB->getParent(); MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, copy0MBB); F->insert(It, sinkMBB); // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. const TargetRegisterInfo *TRI = BB->getParent()->getSubtarget().getRegisterInfo(); if (!MI->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); sinkMBB->addLiveIn(X86::EFLAGS); } // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); // Add the true and fallthrough blocks as its successors. BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); // Create the conditional branch instruction. unsigned Opc = X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB copy0MBB->addSuccessor(sinkMBB); // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); MI->eraseFromParent(); // The pseudo instruction is gone now. return sinkMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); const bool Is64Bit = Subtarget->is64Bit(); const bool IsLP64 = Subtarget->isTarget64BitLP64(); const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; // BB: // ... [Till the alloca] // If stacklet is not large enough, jump to mallocMBB // // bumpMBB: // Allocate by subtracting from RSP // Jump to continueMBB // // mallocMBB: // Allocate by call to runtime // // continueMBB: // ... // [rest of original BB] // MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(getPointerTy()); unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI->getOperand(1).getReg(), physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP; MachineFunction::iterator MBBIter = BB; ++MBBIter; MF->insert(MBBIter, bumpMBB); MF->insert(MBBIter, mallocMBB); MF->insert(MBBIter, continueMBB); continueMBB->splice(continueMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); continueMBB->transferSuccessorsAndUpdatePHIs(BB); // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. const uint32_t *RegMask = MF->getTarget() .getSubtargetImpl() ->getRegisterInfo() ->getCallPreservedMask(CallingConv::C); if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); } else if (Is64Bit) { BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EDI, RegState::Implicit) .addReg(X86::EAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EAX, RegState::ImplicitDefine); } if (!Is64Bit) BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) .addImm(16); BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) .addReg(IsLP64 ? X86::RAX : X86::EAX); BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Set up the CFG correctly. BB->addSuccessor(bumpMBB); BB->addSuccessor(mallocMBB); mallocMBB->addSuccessor(continueMBB); bumpMBB->addSuccessor(continueMBB); // Take care of the PHI nodes. BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), MI->getOperand(0).getReg()) .addReg(mallocPtrVReg).addMBB(mallocMBB) .addReg(bumpSPPtrVReg).addMBB(bumpMBB); // Delete the original pseudo instruction. MI->eraseFromParent(); // And we're done. return continueMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); assert(!Subtarget->isTargetMachO()); // The lowering is pretty easy: we're just emitting the call to _alloca. The // non-trivial part is impdef of ESP. if (Subtarget->isTargetWin64()) { if (Subtarget->isTargetCygMing()) { // ___chkstk(Mingw64): // Clobbers R10, R11, RAX and EFLAGS. // Updates RSP. BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) .addExternalSymbol("___chkstk") .addReg(X86::RAX, RegState::Implicit) .addReg(X86::RSP, RegState::Implicit) .addReg(X86::RAX, RegState::Define | RegState::Implicit) .addReg(X86::RSP, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); } else { // __chkstk(MSVCRT): does not update stack pointer. // Clobbers R10, R11 and EFLAGS. BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) .addExternalSymbol("__chkstk") .addReg(X86::RAX, RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); // RAX has the offset to be subtracted from RSP. BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) .addReg(X86::RSP) .addReg(X86::RAX); } } else { const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() || Subtarget->isTargetWindowsItanium()) ? "_chkstk" : "_alloca"; BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) .addExternalSymbol(StackProbeSymbol) .addReg(X86::EAX, RegState::Implicit) .addReg(X86::ESP, RegState::Implicit) .addReg(X86::EAX, RegState::Define | RegState::Implicit) .addReg(X86::ESP, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); } MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, MachineBasicBlock *BB) const { // This is pretty easy. We're taking the value that we received from // our load from the relocation, sticking it in either RDI (x86-64) // or EAX and doing an indirect call. The return value will then // be in the normal return register. MachineFunction *F = BB->getParent(); const X86InstrInfo *TII = static_cast(F->getSubtarget().getInstrInfo()); DebugLoc DL = MI->getDebugLoc(); assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); assert(MI->getOperand(3).isGlobal() && "This should be a global"); // Get a register mask for the lowered call. // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = F->getTarget() .getSubtargetImpl() ->getRegisterInfo() ->getCallPreservedMask(CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) .addReg(X86::RIP) .addImm(0).addReg(0) .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, MI->getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(0) .addImm(0).addReg(0) .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, MI->getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } else { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(TII->getGlobalBaseReg(F)) .addImm(0).addReg(0) .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, MI->getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = MBB; ++I; // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); unsigned DstReg; unsigned MemOpndSlot = 0; unsigned CurOp = 0; DstReg = MI->getOperand(CurOp++).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(RC->hasType(MVT::i32) && "Invalid destination!"); unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); MemOpndSlot = CurOp; MVT PVT = getPointerTy(); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); // For v = setjmp(buf), we generate // // thisMBB: // buf[LabelOffset] = restoreMBB // SjLjSetup restoreMBB // // mainMBB: // v_main = 0 // // sinkMBB: // v = phi(main, restore) // // restoreMBB: // if base pointer being used, load it from frame // v_restore = 1 MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, sinkMBB); MF->push_back(restoreMBB); MachineInstrBuilder MIB; // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: unsigned PtrStoreOpc = 0; unsigned LabelReg = 0; const int64_t LabelOffset = 1 * PVT.getStoreSize(); Reloc::Model RM = MF->getTarget().getRelocationModel(); bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); // Prepare IP either in reg or imm. if (!UseImmLabel) { PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; const TargetRegisterClass *PtrRC = getRegClassFor(PVT); LabelReg = MRI.createVirtualRegister(PtrRC); if (Subtarget->is64Bit()) { MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) .addReg(X86::RIP) .addImm(0) .addReg(0) .addMBB(restoreMBB) .addReg(0); } else { const X86InstrInfo *XII = static_cast(TII); MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) .addReg(XII->getGlobalBaseReg(MF)) .addImm(0) .addReg(0) .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) .addReg(0); } } else PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; // Store IP MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); else MIB.addOperand(MI->getOperand(MemOpndSlot + i)); } if (!UseImmLabel) MIB.addReg(LabelReg); else MIB.addMBB(restoreMBB); MIB.setMemRefs(MMOBegin, MMOEnd); // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); const X86RegisterInfo *RegInfo = static_cast( MF->getSubtarget().getRegisterInfo()); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); // mainMBB: // EAX = 0 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); mainMBB->addSuccessor(sinkMBB); // sinkMBB: BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) .addReg(mainDstReg).addMBB(mainMBB) .addReg(restoreDstReg).addMBB(restoreMBB); // restoreMBB: if (RegInfo->hasBasePointer(*MF)) { const X86Subtarget &STI = MF->getTarget().getSubtarget(); const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); X86MachineFunctionInfo *X86FI = MF->getInfo(); X86FI->setRestoreBasePointer(MF); unsigned FramePtr = RegInfo->getFrameRegister(*MF); unsigned BasePtr = RegInfo->getBaseRegister(); unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), FramePtr, true, X86FI->getRestoreBasePointerOffset()) .setMIFlag(MachineInstr::FrameSetup); } BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); restoreMBB->addSuccessor(sinkMBB); MI->eraseFromParent(); return sinkMBB; } MachineBasicBlock * X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); MVT PVT = getPointerTy(); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); const TargetRegisterClass *RC = (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = static_cast( MF->getSubtarget().getRegisterInfo()); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); MachineInstrBuilder MIB; const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t SPOffset = 2 * PVT.getStoreSize(); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; // Reload FP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) MIB.addOperand(MI->getOperand(i)); MIB.setMemRefs(MMOBegin, MMOEnd); // Reload IP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI->getOperand(i), LabelOffset); else MIB.addOperand(MI->getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload SP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI->getOperand(i), SPOffset); else MIB.addOperand(MI->getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Jump BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); MI->eraseFromParent(); return MBB; } // Replace 213-type (isel default) FMA3 instructions with 231-type for // accumulator loops. Writing back to the accumulator allows the coalescer // to remove extra copies in the loop. MachineBasicBlock * X86TargetLowering::emitFMA3Instr(MachineInstr *MI, MachineBasicBlock *MBB) const { MachineOperand &AddendOp = MI->getOperand(3); // Bail out early if the addend isn't a register - we can't switch these. if (!AddendOp.isReg()) return MBB; MachineFunction &MF = *MBB->getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); // Check whether the addend is defined by a PHI: assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?"); MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg()); if (!AddendDef.isPHI()) return MBB; // Look for the following pattern: // loop: // %addend = phi [%entry, 0], [%loop, %result] // ... // %result = FMA213 %m2, %m1, %addend // Replace with: // loop: // %addend = phi [%entry, 0], [%loop, %result] // ... // %result = FMA231 %addend, %m1, %m2 for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) { assert(AddendDef.getOperand(i).isReg()); MachineOperand PHISrcOp = AddendDef.getOperand(i); MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg()); if (&PHISrcInst == MI) { // Found a matching instruction. unsigned NewFMAOpc = 0; switch (MI->getOpcode()) { case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break; case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break; case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break; case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break; case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break; case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break; case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break; case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break; case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break; case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break; case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break; case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break; case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break; case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break; case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break; case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break; case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break; case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break; case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break; case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break; case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break; case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break; case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break; case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break; case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break; case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break; case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break; case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break; case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break; case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break; case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break; case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break; default: llvm_unreachable("Unrecognized FMA variant."); } const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) .addOperand(MI->getOperand(0)) .addOperand(MI->getOperand(3)) .addOperand(MI->getOperand(2)) .addOperand(MI->getOperand(1)); MBB->insert(MachineBasicBlock::iterator(MI), MIB); MI->eraseFromParent(); } } return MBB; } MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { switch (MI->getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case X86::TAILJMPd64: case X86::TAILJMPr64: case X86::TAILJMPm64: llvm_unreachable("TAILJMP64 would not be touched here."); case X86::TCRETURNdi64: case X86::TCRETURNri64: case X86::TCRETURNmi64: return BB; case X86::WIN_ALLOCA: return EmitLoweredWinAlloca(MI, BB); case X86::SEG_ALLOCA_32: case X86::SEG_ALLOCA_64: return EmitLoweredSegAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); case X86::CMOV_GR8: case X86::CMOV_FR32: case X86::CMOV_FR64: case X86::CMOV_V4F32: case X86::CMOV_V2F64: case X86::CMOV_V2I64: case X86::CMOV_V8F32: case X86::CMOV_V4F64: case X86::CMOV_V4I64: case X86::CMOV_V16F32: case X86::CMOV_V8F64: case X86::CMOV_V8I64: case X86::CMOV_GR16: case X86::CMOV_GR32: case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: return EmitLoweredSelect(MI, BB); case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: case X86::FP64_TO_INT16_IN_MEM: case X86::FP64_TO_INT32_IN_MEM: case X86::FP64_TO_INT64_IN_MEM: case X86::FP80_TO_INT16_IN_MEM: case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { MachineFunction *F = BB->getParent(); const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); // Load the old value of the high byte of the control word... unsigned OldCW = F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), CWFrameIdx); // Set the high part to be round to zero... addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) .addImm(0xC7F); // Reload the modified control word now... addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); // Restore the memory image of control word to original value addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) .addReg(OldCW); // Get the X86 opcode to use. unsigned Opc; switch (MI->getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; } X86AddressMode AM; MachineOperand &Op = MI->getOperand(0); if (Op.isReg()) { AM.BaseType = X86AddressMode::RegBase; AM.Base.Reg = Op.getReg(); } else { AM.BaseType = X86AddressMode::FrameIndexBase; AM.Base.FrameIndex = Op.getIndex(); } Op = MI->getOperand(1); if (Op.isImm()) AM.Scale = Op.getImm(); Op = MI->getOperand(2); if (Op.isImm()) AM.IndexReg = Op.getImm(); Op = MI->getOperand(3); if (Op.isGlobal()) { AM.GV = Op.getGlobal(); } else { AM.Disp = Op.getImm(); } addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); // Reload the original control word now. addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } // String/text processing lowering. case X86::PCMPISTRM128REG: case X86::VPCMPISTRM128REG: case X86::PCMPISTRM128MEM: case X86::VPCMPISTRM128MEM: case X86::PCMPESTRM128REG: case X86::VPCMPESTRM128REG: case X86::PCMPESTRM128MEM: case X86::VPCMPESTRM128MEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); // String/text processing lowering. case X86::PCMPISTRIREG: case X86::VPCMPISTRIREG: case X86::PCMPISTRIMEM: case X86::VPCMPISTRIMEM: case X86::PCMPESTRIREG: case X86::VPCMPESTRIREG: case X86::PCMPESTRIMEM: case X86::VPCMPESTRIMEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); // Thread synchronization. case X86::MONITOR: return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(), Subtarget); // xbegin case X86::XBEGIN: return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); case X86::VAARG_64: return EmitVAARG64WithCustomInserter(MI, BB); case X86::EH_SjLj_SetJmp32: case X86::EH_SjLj_SetJmp64: return emitEHSjLjSetJmp(MI, BB); case X86::EH_SjLj_LongJmp32: case X86::EH_SjLj_LongJmp64: return emitEHSjLjLongJmp(MI, BB); case TargetOpcode::STATEPOINT: // As an implementation detail, STATEPOINT shares the STACKMAP format at // this point in the process. We diverge later. return emitPatchPoint(MI, BB); case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); case X86::VFMADDPDr213r: case X86::VFMADDPSr213r: case X86::VFMADDSDr213r: case X86::VFMADDSSr213r: case X86::VFMSUBPDr213r: case X86::VFMSUBPSr213r: case X86::VFMSUBSDr213r: case X86::VFMSUBSSr213r: case X86::VFNMADDPDr213r: case X86::VFNMADDPSr213r: case X86::VFNMADDSDr213r: case X86::VFNMADDSSr213r: case X86::VFNMSUBPDr213r: case X86::VFNMSUBPSr213r: case X86::VFNMSUBSDr213r: case X86::VFNMSUBSSr213r: case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPSr213r: case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPSr213r: case X86::VFMADDPDr213rY: case X86::VFMADDPSr213rY: case X86::VFMSUBPDr213rY: case X86::VFMSUBPSr213rY: case X86::VFNMADDPDr213rY: case X86::VFNMADDPSr213rY: case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPSr213rY: case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPSr213rY: case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPSr213rY: return emitFMA3Instr(MI, BB); } } //===----------------------------------------------------------------------===// // X86 Optimization Hooks //===----------------------------------------------------------------------===// void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { unsigned BitWidth = KnownZero.getBitWidth(); unsigned Opc = Op.getOpcode(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. switch (Opc) { default: break; case X86ISD::ADD: case X86ISD::SUB: case X86ISD::ADC: case X86ISD::SBB: case X86ISD::SMUL: case X86ISD::UMUL: case X86ISD::INC: case X86ISD::DEC: case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: // These nodes' second result is a boolean. if (Op.getResNo() == 0) break; // Fallthrough case X86ISD::SETCC: KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ISD::INTRINSIC_WO_CHAIN: { unsigned IntId = cast(Op.getOperand(0))->getZExtValue(); unsigned NumLoBits = 0; switch (IntId) { default: break; case Intrinsic::x86_sse_movmsk_ps: case Intrinsic::x86_avx_movmsk_ps_256: case Intrinsic::x86_sse2_movmsk_pd: case Intrinsic::x86_avx_movmsk_pd_256: case Intrinsic::x86_mmx_pmovmskb: case Intrinsic::x86_sse2_pmovmskb_128: case Intrinsic::x86_avx2_pmovmskb: { // High bits of movmskp{s|d}, pmovmskb are known zero. switch (IntId) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; } KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); break; } } break; } } } unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( SDValue Op, const SelectionDAG &, unsigned Depth) const { // SETCC_CARRY sets the dest to ~0 for true or 0 for false. if (Op.getOpcode() == X86ISD::SETCC_CARRY) return Op.getValueType().getScalarType().getSizeInBits(); // Fallback case. return 1; } /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the /// node is a GlobalAddress + offset. bool X86TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const { if (N->getOpcode() == X86ISD::Wrapper) { if (isa(N->getOperand(0))) { GA = cast(N->getOperand(0))->getGlobal(); Offset = cast(N->getOperand(0))->getOffset(); return true; } } return TargetLowering::isGAPlusOffset(N, GA, Offset); } /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the /// same as extracting the high 128-bit part of 256-bit vector and then /// inserting the result into the low part of a new 256-bit vector static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || SVOp->getMaskElt(j) >= 0) return false; return true; } /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the /// same as extracting the low 128-bit part of 256-bit vector and then /// inserting the result into the high part of a new 256-bit vector static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); // vector_shuffle or for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || SVOp->getMaskElt(j) >= 0) return false; return true; } /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget* Subtarget) { SDLoc dl(N); ShuffleVectorSDNode *SVOp = cast(N); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); EVT VT = SVOp->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); if (V1.getOpcode() == ISD::CONCAT_VECTORS && V2.getOpcode() == ISD::CONCAT_VECTORS) { // // 0,0,0,... // | // V UNDEF BUILD_VECTOR UNDEF // \ / \ / // CONCAT_VECTOR CONCAT_VECTOR // \ / // \ / // RESULT: V + zero extended // if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || V2.getOperand(1).getOpcode() != ISD::UNDEF || V1.getOperand(1).getOpcode() != ISD::UNDEF) return SDValue(); if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) return SDValue(); // To match the shuffle mask, the first half of the mask should // be exactly the first vector, and all the rest a splat with the // first element of the second one. for (unsigned i = 0; i != NumElems/2; ++i) if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) return SDValue(); // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. if (LoadSDNode *Ld = dyn_cast(V1.getOperand(0))) { if (Ld->hasNUsesOfValue(1, 0)) { SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, Ld->getMemoryVT(), Ld->getPointerInfo(), Ld->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, false/*WriteMem*/); // Make sure the newly-created LOAD is in the same position as Ld in // terms of dependency. We create a TokenFactor for Ld and ResNode, // and update uses of Ld's output chain to use the TokenFactor. if (Ld->hasAnyUseOfValue(1)) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); } return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); } } // Emit a zeroed vector and insert the desired subvector on its // first half. SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); return DCI.CombineTo(N, InsV); } //===--------------------------------------------------------------------===// // Combine some shuffles into subvector extracts and inserts: // // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> if (isShuffleHigh128VectorInsertLow(SVOp)) { SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl); SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl); return DCI.CombineTo(N, InsV); } // vector_shuffle or if (isShuffleLow128VectorInsertHigh(SVOp)) { SDValue V = Extract128BitVector(V1, 0, DAG, dl); SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl); return DCI.CombineTo(N, InsV); } return SDValue(); } /// \brief Combine an arbitrary chain of shuffles into a single instruction if /// possible. /// /// This is the leaf of the recursive combinine below. When we have found some /// chain of single-use x86 shuffle instructions and accumulated the combined /// shuffle mask represented by them, this will try to pattern match that mask /// into either a single instruction if there is a special purpose instruction /// for this operation, or into a PSHUFB instruction which is a fully general /// instruction but should only be used to replace chains over a certain depth. static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, int Depth, bool HasPSHUFB, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { assert(!Mask.empty() && "Cannot combine an empty shuffle mask!"); // Find the operand that enters the chain. Note that multiple uses are OK // here, we're not going to remove the operand we find. SDValue Input = Op.getOperand(0); while (Input.getOpcode() == ISD::BITCAST) Input = Input.getOperand(0); MVT VT = Input.getSimpleValueType(); MVT RootVT = Root.getSimpleValueType(); SDLoc DL(Root); // Just remove no-op shuffle masks. if (Mask.size() == 1) { DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input), /*AddTo*/ true); return true; } // Use the float domain if the operand type is a floating point type. bool FloatDomain = VT.isFloatingPoint(); // For floating point shuffles, we don't have free copies in the shuffle // instructions or the ability to load as part of the instruction, so // canonicalize their shuffles to UNPCK or MOV variants. // // Note that even with AVX we prefer the PSHUFD form of shuffle for integer // vectors because it can have a load folded into it that UNPCK cannot. This // doesn't preclude something switching to the shorter encoding post-RA. if (FloatDomain) { if (Mask.equals(0, 0) || Mask.equals(1, 1)) { bool Lo = Mask.equals(0, 0); unsigned Shuffle; MVT ShuffleVT; // Check if we have SSE3 which will let us use MOVDDUP. That instruction // is no slower than UNPCKLPD but has the option to fold the input operand // into even an unaligned memory load. if (Lo && Subtarget->hasSSE3()) { Shuffle = X86ISD::MOVDDUP; ShuffleVT = MVT::v2f64; } else { // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller // than the UNPCK variants. Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS; ShuffleVT = MVT::v4f32; } if (Depth == 1 && Root->getOpcode() == Shuffle) return false; // Nothing to do! Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); if (Shuffle == X86ISD::MOVDDUP) Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); else Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), /*AddTo*/ true); return true; } if (Subtarget->hasSSE3() && (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) { bool Lo = Mask.equals(0, 0, 2, 2); unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP; MVT ShuffleVT = MVT::v4f32; if (Depth == 1 && Root->getOpcode() == Shuffle) return false; // Nothing to do! Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), /*AddTo*/ true); return true; } if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) { bool Lo = Mask.equals(0, 0, 1, 1); unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; MVT ShuffleVT = MVT::v4f32; if (Depth == 1 && Root->getOpcode() == Shuffle) return false; // Nothing to do! Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), /*AddTo*/ true); return true; } } // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK // variants as none of these have single-instruction variants that are // superior to the UNPCK formulation. if (!FloatDomain && (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) || Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) || Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) || Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15))) { bool Lo = Mask[0] == 0; unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; if (Depth == 1 && Root->getOpcode() == Shuffle) return false; // Nothing to do! MVT ShuffleVT; switch (Mask.size()) { case 8: ShuffleVT = MVT::v8i16; break; case 16: ShuffleVT = MVT::v16i8; break; default: llvm_unreachable("Impossible mask size!"); }; Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), /*AddTo*/ true); return true; } // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. if (Depth < 2) return false; // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we // can replace them with a single PSHUFB instruction profitably. Intel's // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but // in practice PSHUFB tends to be *very* fast so we're more aggressive. if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) { SmallVector PSHUFBMask; assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!"); int Ratio = 16 / Mask.size(); for (unsigned i = 0; i < 16; ++i) { if (Mask[i / Ratio] == SM_SentinelUndef) { PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } int M = Mask[i / Ratio] != SM_SentinelZero ? Ratio * Mask[i / Ratio] + i % Ratio : 255; PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8)); } Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input); DCI.AddToWorklist(Op.getNode()); SDValue PSHUFBMaskOp = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask); DCI.AddToWorklist(PSHUFBMaskOp.getNode()); Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), /*AddTo*/ true); return true; } // Failed to find any combines. return false; } /// \brief Fully generic combining of x86 shuffle instructions. /// /// This should be the last combine run over the x86 shuffle instructions. Once /// they have been fully optimized, this will recursively consider all chains /// of single-use shuffle instructions, build a generic model of the cumulative /// shuffle operation, and check for simpler instructions which implement this /// operation. We use this primarily for two purposes: /// /// 1) Collapse generic shuffles to specialized single instructions when /// equivalent. In most cases, this is just an encoding size win, but /// sometimes we will collapse multiple generic shuffles into a single /// special-purpose shuffle. /// 2) Look for sequences of shuffle instructions with 3 or more total /// instructions, and replace them with the slightly more expensive SSSE3 /// PSHUFB instruction if available. We do this as the last combining step /// to ensure we avoid using PSHUFB if we can implement the shuffle with /// a suitable short sequence of other instructions. The PHUFB will either /// use a register or have to read from memory and so is slightly (but only /// slightly) more expensive than the other shuffle instructions. /// /// Because this is inherently a quadratic operation (for each shuffle in /// a chain, we recurse up the chain), the depth is limited to 8 instructions. /// This should never be an issue in practice as the shuffle lowering doesn't /// produce sequences of more than 8 instructions. /// /// FIXME: We will currently miss some cases where the redundant shuffling /// would simplify under the threshold for PSHUFB formation because of /// combine-ordering. To fix this, we should do the redundant instruction /// combining in this recursive walk. static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, ArrayRef RootMask, int Depth, bool HasPSHUFB, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. if (Depth > 8) return false; // Directly rip through bitcasts to find the underlying operand. while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse()) Op = Op.getOperand(0); MVT VT = Op.getSimpleValueType(); if (!VT.isVector()) return false; // Bail if we hit a non-vector. // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit // version should be added. if (VT.getSizeInBits() != 128) return false; assert(Root.getSimpleValueType().isVector() && "Shuffles operate on vector types!"); assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && "Can only combine shuffles of the same vector register size."); if (!isTargetShuffle(Op.getOpcode())) return false; SmallVector OpMask; bool IsUnary; bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary); // We only can combine unary shuffles which we can decode the mask for. if (!HaveMask || !IsUnary) return false; assert(VT.getVectorNumElements() == OpMask.size() && "Different mask size from vector size!"); assert(((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && "The smaller number of elements must divide the larger."); int RootRatio = std::max(1, OpMask.size() / RootMask.size()); int OpRatio = std::max(1, RootMask.size() / OpMask.size()); assert(((RootRatio == 1 && OpRatio == 1) || (RootRatio == 1) != (OpRatio == 1)) && "Must not have a ratio for both incoming and op masks!"); SmallVector Mask; Mask.reserve(std::max(OpMask.size(), RootMask.size())); // Merge this shuffle operation's mask into our accumulated mask. Note that // this shuffle's mask will be the first applied to the input, followed by the // root mask to get us all the way to the root value arrangement. The reason // for this order is that we are recursing up the operation chain. for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) { int RootIdx = i / RootRatio; if (RootMask[RootIdx] < 0) { // This is a zero or undef lane, we're done. Mask.push_back(RootMask[RootIdx]); continue; } int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; int OpIdx = RootMaskedIdx / OpRatio; if (OpMask[OpIdx] < 0) { // The incoming lanes are zero or undef, it doesn't matter which ones we // are using. Mask.push_back(OpMask[OpIdx]); continue; } // Ok, we have non-zero lanes, map them through. Mask.push_back(OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio); } // See if we can recurse into the operand to combine more things. switch (Op.getOpcode()) { case X86ISD::PSHUFB: HasPSHUFB = true; case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: if (Op.getOperand(0).hasOneUse() && combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, HasPSHUFB, DAG, DCI, Subtarget)) return true; break; case X86ISD::UNPCKL: case X86ISD::UNPCKH: assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!"); // We can't check for single use, we have to check that this shuffle is the only user. if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, HasPSHUFB, DAG, DCI, Subtarget)) return true; break; } // Minor canonicalization of the accumulated shuffle mask to make it easier // to match below. All this does is detect masks with squential pairs of // elements, and shrink them to the half-width mask. It does this in a loop // so it will reduce the size of the mask to the minimal width mask which // performs an equivalent shuffle. SmallVector WidenedMask; while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { Mask = std::move(WidenedMask); WidenedMask.clear(); } return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI, Subtarget); } /// \brief Get the PSHUF-style mask from PSHUF node. /// /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 /// PSHUF-style masks that can be reused with such instructions. static SmallVector getPSHUFShuffleMask(SDValue N) { SmallVector Mask; bool IsUnary; bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary); (void)HaveMask; assert(HaveMask); switch (N.getOpcode()) { case X86ISD::PSHUFD: return Mask; case X86ISD::PSHUFLW: Mask.resize(4); return Mask; case X86ISD::PSHUFHW: Mask.erase(Mask.begin(), Mask.begin() + 4); for (int &M : Mask) M -= 4; return Mask; default: llvm_unreachable("No valid shuffle instruction found!"); } } /// \brief Search for a combinable shuffle across a chain ending in pshufd. /// /// We walk up the chain and look for a combinable shuffle, skipping over /// shuffles that we could hoist this shuffle's transformation past without /// altering anything. static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { assert(N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); // Walk up a single-use chain looking for a combinable shuffle. Keep a stack // of the shuffles in the chain so that we can form a fresh chain to replace // this one. SmallVector Chain; SDValue V = N.getOperand(0); for (; V.hasOneUse(); V = V.getOperand(0)) { switch (V.getOpcode()) { default: return SDValue(); // Nothing combined! case ISD::BITCAST: // Skip bitcasts as we always know the type for the target specific // instructions. continue; case X86ISD::PSHUFD: // Found another dword shuffle. break; case X86ISD::PSHUFLW: // Check that the low words (being shuffled) are the identity in the // dword shuffle, and the high words are self-contained. if (Mask[0] != 0 || Mask[1] != 1 || !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) return SDValue(); Chain.push_back(V); continue; case X86ISD::PSHUFHW: // Check that the high words (being shuffled) are the identity in the // dword shuffle, and the low words are self-contained. if (Mask[2] != 2 || Mask[3] != 3 || !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) return SDValue(); Chain.push_back(V); continue; case X86ISD::UNPCKL: case X86ISD::UNPCKH: // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16) return SDValue(); // Search for a half-shuffle which we can combine with. unsigned CombineOp = V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; if (V.getOperand(0) != V.getOperand(1) || !V->isOnlyUserOf(V.getOperand(0).getNode())) return SDValue(); Chain.push_back(V); V = V.getOperand(0); do { switch (V.getOpcode()) { default: return SDValue(); // Nothing to combine. case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: if (V.getOpcode() == CombineOp) break; Chain.push_back(V); // Fallthrough! case ISD::BITCAST: V = V.getOperand(0); continue; } break; } while (V.hasOneUse()); break; } // Break out of the loop if we break out of the switch. break; } if (!V.hasOneUse()) // We fell out of the loop without finding a viable combining instruction. return SDValue(); // Merge this node's mask and our incoming mask. SmallVector VMask = getPSHUFShuffleMask(V); for (int &M : Mask) M = VMask[M]; V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DAG)); // Rebuild the chain around this new shuffle. while (!Chain.empty()) { SDValue W = Chain.pop_back_val(); if (V.getValueType() != W.getOperand(0).getValueType()) V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V); switch (W.getOpcode()) { default: llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); case X86ISD::UNPCKL: case X86ISD::UNPCKH: V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); break; case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); break; } } if (V.getValueType() != N.getValueType()) V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V); // Return the new chain to replace N. return V; } /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw. /// /// We walk up the chain, skipping shuffles of the other half and looking /// through shuffles which switch halves trying to find a shuffle of the same /// pair of dwords. static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef Mask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { assert( (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); unsigned CombineOpcode = N.getOpcode(); // Walk up a single-use chain looking for a combinable shuffle. SDValue V = N.getOperand(0); for (; V.hasOneUse(); V = V.getOperand(0)) { switch (V.getOpcode()) { default: return false; // Nothing combined! case ISD::BITCAST: // Skip bitcasts as we always know the type for the target specific // instructions. continue; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: if (V.getOpcode() == CombineOpcode) break; // Other-half shuffles are no-ops. continue; } // Break out of the loop if we break out of the switch. break; } if (!V.hasOneUse()) // We fell out of the loop without finding a viable combining instruction. return false; // Combine away the bottom node as its shuffle will be accumulated into // a preceding shuffle. DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); // Record the old value. SDValue Old = V; // Merge this node's mask and our incoming mask (adjusted to account for all // the pshufd instructions encountered). SmallVector VMask = getPSHUFShuffleMask(V); for (int &M : Mask) M = VMask[M]; V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DAG)); // Check that the shuffles didn't cancel each other out. If not, we need to // combine to the new one. if (Old != V) // Replace the combinable shuffle with the combined one, updating all users // so that we re-evaluate the chain here. DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); return true; } /// \brief Try to combine x86 target specific shuffles. static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc DL(N); MVT VT = N.getSimpleValueType(); SmallVector Mask; switch (N.getOpcode()) { case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: Mask = getPSHUFShuffleMask(N); assert(Mask.size() == 4); break; default: return SDValue(); } // Nuke no-op shuffles that show up after combining. if (isNoopShuffleMask(Mask)) return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); // Look for simplifications involving one or two shuffle instructions. SDValue V = N.getOperand(0); switch (N.getOpcode()) { default: break; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: assert(VT == MVT::v8i16); (void)VT; if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) return SDValue(); // We combined away this shuffle, so we're done. // See if this reduces to a PSHUFD which is no more expensive and can // combine with more operations. Note that it has to at least flip the // dwords as otherwise it would have been removed as a no-op. if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) { int DMask[] = {0, 1, 2, 3}; int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; DMask[DOffset + 0] = DOffset + 1; DMask[DOffset + 1] = DOffset + 0; V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); DCI.AddToWorklist(V.getNode()); V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, getV4X86ShuffleImm8ForMask(DMask, DAG)); DCI.AddToWorklist(V.getNode()); return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); } // Look for shuffle patterns which can be implemented as a single unpack. // FIXME: This doesn't handle the location of the PSHUFD generically, and // only works when we have a PSHUFD followed by two half-shuffles. if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && (V.getOpcode() == X86ISD::PSHUFLW || V.getOpcode() == X86ISD::PSHUFHW) && V.getOpcode() != N.getOpcode() && V.hasOneUse()) { SDValue D = V.getOperand(0); while (D.getOpcode() == ISD::BITCAST && D.hasOneUse()) D = D.getOperand(0); if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { SmallVector VMask = getPSHUFShuffleMask(V); SmallVector DMask = getPSHUFShuffleMask(D); int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; int WordMask[8]; for (int i = 0; i < 4; ++i) { WordMask[i + NOffset] = Mask[i] + NOffset; WordMask[i + VOffset] = VMask[i] + VOffset; } // Map the word mask through the DWord mask. int MappedMask[8]; for (int i = 0; i < 8; ++i) MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3}; const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7}; if (std::equal(std::begin(MappedMask), std::end(MappedMask), std::begin(UnpackLoMask)) || std::equal(std::begin(MappedMask), std::end(MappedMask), std::begin(UnpackHiMask))) { // We can replace all three shuffles with an unpack. V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0)); DCI.AddToWorklist(V.getNode()); return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, MVT::v8i16, V, V); } } } break; case X86ISD::PSHUFD: if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI)) return NewN; break; } return SDValue(); } /// \brief Try to combine a shuffle into a target-specific add-sub node. /// /// We combine this directly on the abstract vector shuffle nodes so it is /// easier to generically match. We also insert dummy vector shuffle nodes for /// the operands which explicitly discard the lanes which are unused by this /// operation to try to flow through the rest of the combiner the fact that /// they're unused. static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask // extraction tool to support more. if (N->getOpcode() != ISD::VECTOR_SHUFFLE) return SDValue(); auto *SVN = cast(N); ArrayRef Mask = SVN->getMask(); SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); // We require the first shuffle operand to be the SUB node, and the second to // be the ADD node. // FIXME: We should support the commuted patterns. if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD) return SDValue(); // If there are other uses of these operations we can't fold them. if (!V1->hasOneUse() || !V2->hasOneUse()) return SDValue(); // Ensure that both operations have the same operands. Note that we can // commute the FADD operands. SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) return SDValue(); // We're looking for blends between FADD and FSUB nodes. We insist on these // nodes being lined up in a specific expected pattern. if (!(isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 0, 5, 2, 7) || isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15))) return SDValue(); // Only specific types are legal at this point, assert so we notice if and // when these change. assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 || VT == MVT::v4f64) && "Unknown vector type encountered!"); return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); } /// PerformShuffleCombine - Performs several different shuffle combines. static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc dl(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); // Don't create instructions with illegal types after legalize types has run. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) return SDValue(); // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB node. if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3()) if (SDValue AddSub = combineShuffleToAddSub(N, DAG)) return AddSub; // Combine 256-bit vector shuffles. This is only profitable when in AVX mode if (Subtarget->hasFp256() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI, Subtarget); // During Type Legalization, when promoting illegal vector types, // the backend might introduce new shuffle dag nodes and bitcasts. // // This code performs the following transformation: // fold: (shuffle (bitcast (BINOP A, B)), Undef, ) -> // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, ) // // We do this only if both the bitcast and the BINOP dag nodes have // one use. Also, perform this transformation only if the new binary // operation is legal. This is to avoid introducing dag nodes that // potentially need to be further expanded (or custom lowered) into a // less optimal sequence of dag nodes. if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() && N0.getOpcode() == ISD::BITCAST) { SDValue BC0 = N0.getOperand(0); EVT SVT = BC0.getValueType(); unsigned Opcode = BC0.getOpcode(); unsigned NumElts = VT.getVectorNumElements(); if (BC0.hasOneUse() && SVT.isVector() && SVT.getVectorNumElements() * 2 == NumElts && TLI.isOperationLegal(Opcode, VT)) { bool CanFold = false; switch (Opcode) { default : break; case ISD::ADD : case ISD::FADD : case ISD::SUB : case ISD::FSUB : case ISD::MUL : case ISD::FMUL : CanFold = true; } unsigned SVTNumElts = SVT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast(N); for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) CanFold = SVOp->getMaskElt(i) == (int)(i * 2); for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) CanFold = SVOp->getMaskElt(i) < 0; if (CanFold) { SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0)); SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1)); SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]); } } } // Only handle 128 wide vector from here on. if (!VT.is128BitVector()) return SDValue(); // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are // consecutive, non-overlapping, and in the right order. SmallVector Elts; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); if (LD.getNode()) return LD; if (isTargetShuffle(N->getOpcode())) { SDValue Shuffle = PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); if (Shuffle.getNode()) return Shuffle; // Try recursively combining arbitrary sequences of x86 shuffle // instructions into higher-order shuffles. We do this after combining // specific PSHUF instruction sequences into their minimal form so that we // can evaluate how many specialized shuffle instructions are involved in // a particular chain. SmallVector NonceMask; // Just a placeholder. NonceMask.push_back(0); if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask, /*Depth*/ 1, /*HasPSHUFB*/ false, DAG, DCI, Subtarget)) return SDValue(); // This routine will use CombineTo to replace N. } return SDValue(); } /// PerformTruncateCombine - Converts truncate operation to /// a sequence of vector shuffle operations. /// It is possible when we truncate 256-bit vector to 128-bit vector static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { return SDValue(); } /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target /// specific shuffle of a load can be folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but /// shuffles have been custom lowered so we need to handle those here. static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue InVec = N->getOperand(0); SDValue EltNo = N->getOperand(1); if (!isa(EltNo)) return SDValue(); EVT OriginalVT = InVec.getValueType(); if (InVec.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); EVT BCVT = InVec.getOperand(0).getValueType(); if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) return SDValue(); InVec = InVec.getOperand(0); } EVT CurrentVT = InVec.getValueType(); if (!isTargetShuffle(InVec.getOpcode())) return SDValue(); // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); SmallVector ShuffleMask; bool UnaryShuffle; if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast(EltNo)->getZExtValue(); int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1); // If inputs to shuffle are the same for both ops, then allow 2 uses unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; if (LdNode.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) return SDValue(); AllowedUses = 1; // only allow 1 load use if we have a bitcast LdNode = LdNode.getOperand(0); } if (!ISD::isNormalLoad(LdNode.getNode())) return SDValue(); LoadSDNode *LN0 = cast(LdNode); if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) return SDValue(); EVT EltVT = N->getValueType(0); // If there's a bitcast before the shuffle, check if the load type and // alignment is valid. unsigned Align = LN0->getAlignment(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment( EltVT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) return SDValue(); // All checks match so transform back to vector_shuffle so that DAG combiner // can finish the job SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : InVec.getOperand(1); Shuffle = DAG.getVectorShuffle(CurrentVT, dl, InVec.getOperand(0), Shuffle, &ShuffleMask[0]); Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index /// generation and convert it from being a bunch of shuffles and extracts /// into a somewhat faster sequence. For i686, the best sequence is apparently /// storing the value and loading scalars back, while for x64 we should /// use 64-bit extracts and shifts. static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI); if (NewOp.getNode()) return NewOp; SDValue InputVector = N->getOperand(0); // Detect whether we are trying to convert from mmx to i32 and the bitcast // from mmx to v2i32 has a single usage. if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST && InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx && InputVector.hasOneUse() && N->getValueType(0) == MVT::i32) return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), N->getValueType(0), InputVector.getNode()->getOperand(0)); // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. if (InputVector.getValueType() != MVT::v4i32) return SDValue(); // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a // single use which is a sign-extend or zero-extend, and all elements are // used. SmallVector Uses; unsigned ExtractedElements = 0; for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { if (UI.getUse().getResNo() != InputVector.getResNo()) return SDValue(); SDNode *Extract = *UI; if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); if (Extract->getValueType(0) != MVT::i32) return SDValue(); if (!Extract->hasOneUse()) return SDValue(); if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) return SDValue(); if (!isa(Extract->getOperand(1))) return SDValue(); // Record which element was extracted. ExtractedElements |= 1 << cast(Extract->getOperand(1))->getZExtValue(); Uses.push_back(Extract); } // If not all the elements were used, this may not be worthwhile. if (ExtractedElements != 15) return SDValue(); // Ok, we've now decided to do the transformation. // If 64-bit shifts are legal, use the extract-shift sequence, // otherwise bounce the vector off the cache. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Vals[4]; SDLoc dl(InputVector); if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector); EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(); SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, DAG.getConstant(0, VecIdxTy)); SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, DAG.getConstant(1, VecIdxTy)); SDValue ShAmt = DAG.getConstant(32, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64)); Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf); Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); } else { // Store the value to a temporary stack slot. SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, MachinePointerInfo(), false, false, 0); EVT ElementType = InputVector.getValueType().getVectorElementType(); unsigned EltSize = ElementType.getSizeInBits() / 8; // Replace each use (extract) with a load of the appropriate element. for (unsigned i = 0; i < 4; ++i) { uint64_t Offset = EltSize * i; SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), StackPtr, OffsetVal); // Load the scalar. Vals[i] = DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo(), false, false, false, 0); } } // Replace the extracts for (SmallVectorImpl::iterator UI = Uses.begin(), UE = Uses.end(); UI != UE; ++UI) { SDNode *Extract = *UI; SDValue Idx = Extract->getOperand(1); uint64_t IdxVal = cast(Idx)->getZExtValue(); DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); } // The replacement was made in place; don't return anything. return SDValue(); } /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match. static std::pair matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const X86Subtarget *Subtarget) { if (!VT.isVector()) return std::make_pair(0, false); bool NeedSplit = false; switch (VT.getSimpleVT().SimpleTy) { default: return std::make_pair(0, false); case MVT::v4i64: case MVT::v2i64: if (!Subtarget->hasVLX()) return std::make_pair(0, false); break; case MVT::v64i8: case MVT::v32i16: if (!Subtarget->hasBWI()) return std::make_pair(0, false); break; case MVT::v16i32: case MVT::v8i64: if (!Subtarget->hasAVX512()) return std::make_pair(0, false); break; case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: if (!Subtarget->hasAVX2()) NeedSplit = true; if (!Subtarget->hasAVX()) return std::make_pair(0, false); break; case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: if (!Subtarget->hasSSE2()) return std::make_pair(0, false); } // SSE2 has only a small subset of the operations. bool hasUnsigned = Subtarget->hasSSE41() || (Subtarget->hasSSE2() && VT == MVT::v16i8); bool hasSigned = Subtarget->hasSSE41() || (Subtarget->hasSSE2() && VT == MVT::v8i16); ISD::CondCode CC = cast(Cond.getOperand(2))->get(); unsigned Opc = 0; // Check for x CC y ? x : y. if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { switch (CC) { default: break; case ISD::SETULT: case ISD::SETULE: Opc = hasUnsigned ? X86ISD::UMIN : 0; break; case ISD::SETUGT: case ISD::SETUGE: Opc = hasUnsigned ? X86ISD::UMAX : 0; break; case ISD::SETLT: case ISD::SETLE: Opc = hasSigned ? X86ISD::SMIN : 0; break; case ISD::SETGT: case ISD::SETGE: Opc = hasSigned ? X86ISD::SMAX : 0; break; } // Check for x CC y ? y : x -- a min/max with reversed arms. } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && DAG.isEqualTo(RHS, Cond.getOperand(0))) { switch (CC) { default: break; case ISD::SETULT: case ISD::SETULE: Opc = hasUnsigned ? X86ISD::UMAX : 0; break; case ISD::SETUGT: case ISD::SETUGE: Opc = hasUnsigned ? X86ISD::UMIN : 0; break; case ISD::SETLT: case ISD::SETLE: Opc = hasSigned ? X86ISD::SMAX : 0; break; case ISD::SETGT: case ISD::SETGE: Opc = hasSigned ? X86ISD::SMIN : 0; break; } } return std::make_pair(Opc, NeedSplit); } static SDValue transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDLoc dl(N); SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); if (Cond.getOpcode() == ISD::SIGN_EXTEND) { SDValue CondSrc = Cond->getOperand(0); if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG) Cond = CondSrc->getOperand(0); } if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); // A vselect where all conditions and data are constants can be optimized into // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) return SDValue(); unsigned MaskValue = 0; if (!BUILD_VECTORtoBlendMask(cast(Cond), MaskValue)) return SDValue(); MVT VT = N->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); SmallVector ShuffleMask(NumElems, -1); for (unsigned i = 0; i < NumElems; ++i) { // Be sure we emit undef where we can. if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF) ShuffleMask[i] = -1; else ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1); } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isShuffleMaskLegal(ShuffleMask, VT)) return SDValue(); return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]); } /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT /// nodes. static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc DL(N); SDValue Cond = N->getOperand(0); // Get the LHS/RHS of the select. SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); EVT VT = LHS.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we have SSE[12] support, try to form min/max nodes. SSE min/max // instructions match the semantics of the common C idiom xhasSSE2() || (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); unsigned Opcode = 0; // Check for x CC y ? x : y. if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { switch (CC) { default: break; case ISD::SETULT: // Converting this to a min would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMIN; break; case ISD::SETULE: // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: Opcode = X86ISD::FMIN; break; case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMAX; break; case ISD::SETUGT: // Converting this to a max would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETUGE: // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: Opcode = X86ISD::FMAX; break; } // Check for x CC y ? y : x -- a min/max with reversed arms. } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && DAG.isEqualTo(RHS, Cond.getOperand(0))) { switch (CC) { default: break; case ISD::SETOGE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) break; Opcode = X86ISD::FMIN; break; case ISD::SETUGE: // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: Opcode = X86ISD::FMIN; break; case ISD::SETULT: // Converting this to a max would handle NaNs incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; Opcode = X86ISD::FMAX; break; case ISD::SETOLE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETULE: // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: Opcode = X86ISD::FMAX; break; } } if (Opcode) return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); } EVT CondVT = Cond.getValueType(); if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1) { // v16i8 (select v16i1, v16i8, v16i8) does not have a proper // lowering on KNL. In this case we convert it to // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. // The same situation for all 128 and 256-bit vectors of i8 and i16. // Since SKX these selects have a proper lowering. EVT OpVT = LHS.getValueType(); if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && (OpVT.getVectorElementType() == MVT::i8 || OpVT.getVectorElementType() == MVT::i16) && !(Subtarget->hasBWI() && Subtarget->hasVLX())) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); DCI.AddToWorklist(Cond.getNode()); return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); } } // If this is a select between two integer constants, try to do some // optimizations. if (ConstantSDNode *TrueC = dyn_cast(LHS)) { if (ConstantSDNode *FalseC = dyn_cast(RHS)) // Don't do this for crazy integer types. if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { // If this is efficiently invertible, canonicalize the LHSC/RHSC values // so that TrueC (the true value) is larger than FalseC. bool NeedsCondInvert = false; if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && // Efficiently invertible. (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. isa(Cond.getOperand(1))))) { NeedsCondInvert = true; std::swap(TrueC, FalseC); } // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { if (NeedsCondInvert) // Invert the condition if needed. Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, DAG.getConstant(1, Cond.getValueType())); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); unsigned ShAmt = TrueC->getAPIntValue().logBase2(); return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, DAG.getConstant(ShAmt, MVT::i8)); } // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { if (NeedsCondInvert) // Invert the condition if needed. Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, DAG.getConstant(1, Cond.getValueType())); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); } // Optimize cases that will turn into an LEA instruction. This requires // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; bool isFastMultiplier = false; if (Diff < 10) { switch ((unsigned char)Diff) { default: break; case 1: // result = add base, cond case 2: // result = lea base( , cond*2) case 3: // result = lea base(cond, cond*2) case 4: // result = lea base( , cond*4) case 5: // result = lea base(cond, cond*4) case 8: // result = lea base( , cond*8) case 9: // result = lea base(cond, cond*8) isFastMultiplier = true; break; } } if (isFastMultiplier) { APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); if (NeedsCondInvert) // Invert the condition if needed. Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, DAG.getConstant(1, Cond.getValueType())); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); // Scale the condition by the difference. if (Diff != 1) Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, DAG.getConstant(Diff, Cond.getValueType())); // Add the base if non-zero. if (FalseC->getAPIntValue() != 0) Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); return Cond; } } } } // Canonicalize max and min: // (x > y) ? x : y -> (x >= y) ? x : y // (x < y) ? x : y -> (x <= y) ? x : y // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates // the need for an extra compare // against zero. e.g. // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0 // subl %esi, %edi // testl %edi, %edi // movl $0, %eax // cmovgl %edi, %eax // => // xorl %eax, %eax // subl %esi, $edi // cmovsl %eax, %edi if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); switch (CC) { default: break; case ISD::SETLT: case ISD::SETGT: { ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0), Cond.getOperand(1), NewCC); return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); } } } // Early exit check if (!TLI.isTypeLegal(VT)) return SDValue(); // Match VSELECTs into subs with unsigned saturation. if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); // Check if one of the arms of the VSELECT is a zero vector. If it's on the // left side invert the predicate to simplify logic below. SDValue Other; if (ISD::isBuildVectorAllZeros(LHS.getNode())) { Other = RHS; CC = ISD::getSetCCInverse(CC, true); } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { Other = LHS; } if (Other.getNode() && Other->getNumOperands() == 2 && DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); SDValue CondRHS = Cond->getOperand(1); // Look for a general sub with unsigned saturation first. // x >= y ? x-y : 0 --> subus x, y // x > y ? x-y : 0 --> subus x, y if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); if (auto *OpRHSBV = dyn_cast(OpRHS)) if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { if (auto *CondRHSBV = dyn_cast(CondRHS)) if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode()) // If the RHS is a constant we have to reverse the const // canonicalization. // x > C-1 ? x+-C : 0 --> subus x, C if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && CondRHSConst->getAPIntValue() == (-OpRHSConst->getAPIntValue() - 1)) return DAG.getNode( X86ISD::SUBUS, DL, VT, OpLHS, DAG.getConstant(-OpRHSConst->getAPIntValue(), VT)); // Another special case: If C was a sign bit, the sub has been // canonicalized into a xor. // FIXME: Would it be better to use computeKnownBits to determine // whether it's safe to decanonicalize the xor? // x s< 0 ? x^C : 0 --> subus x, C if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && ISD::isBuildVectorAllZeros(CondRHS.getNode()) && OpRHSConst->getAPIntValue().isSignBit()) // Note that we have to rebuild the RHS constant here to ensure we // don't rely on particular values of undef lanes. return DAG.getNode( X86ISD::SUBUS, DL, VT, OpLHS, DAG.getConstant(OpRHSConst->getAPIntValue(), VT)); } } } // Try to match a min/max vector operation. if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) { std::pair ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget); unsigned Opc = ret.first; bool NeedSplit = ret.second; if (Opc && NeedSplit) { unsigned NumElems = VT.getVectorNumElements(); // Extract the LHS vectors SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL); SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL); // Extract the RHS vectors SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL); SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL); // Create min/max for each subvector LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1); RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2); // Merge the result return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS); } else if (Opc) return DAG.getNode(Opc, DL, VT, LHS, RHS); } // Simplify vector selection if condition value type matches vselect // operand type if (N->getOpcode() == ISD::VSELECT && CondVT == VT) { assert(Cond.getValueType().isVector() && "vector select expects a vector selector!"); bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); // Try invert the condition if true value is not all 1s and false value // is not all 0s. if (!TValIsAllOnes && !FValIsAllZeros && // Check if the selector will be produced by CMPP*/PCMP* Cond.getOpcode() == ISD::SETCC && // Check if SETCC has already been promoted TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) { bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); if (TValIsAllZeros || FValIsAllOnes) { SDValue CC = Cond.getOperand(2); ISD::CondCode NewCC = ISD::getSetCCInverse(cast(CC)->get(), Cond.getOperand(0).getValueType().isInteger()); Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); std::swap(LHS, RHS); TValIsAllOnes = FValIsAllOnes; FValIsAllZeros = TValIsAllZeros; } } if (TValIsAllOnes || FValIsAllZeros) { SDValue Ret; if (TValIsAllOnes && FValIsAllZeros) Ret = Cond; else if (TValIsAllOnes) Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getNode(ISD::BITCAST, DL, CondVT, RHS)); else if (FValIsAllZeros) Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, DAG.getNode(ISD::BITCAST, DL, CondVT, LHS)); return DAG.getNode(ISD::BITCAST, DL, VT, Ret); } } // If we know that this node is legal then we know that it is going to be // matched by one of the SSE/AVX BLEND instructions. These instructions only // depend on the highest bit in each word. Try to use SimplifyDemandedBits // to simplify previous instructions. if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && !DCI.isBeforeLegalize() && // We explicitly check against v8i16 and v16i16 because, although // they're marked as Custom, they might only be legal when Cond is a // build_vector of constants. This will be taken care in a later // condition. (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 && VT != MVT::v8i16) && // Don't optimize vector of constants. Those are handled by // the generic code and all the bits must be properly set for // the generic optimizer. !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); // Don't optimize vector selects that map to mask-registers. if (BitWidth == 1) return SDValue(); assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), DCI.isBeforeLegalizeOps()); if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) { // If we changed the computation somewhere in the DAG, this change // will affect all users of Cond. // Make sure it is fine and update all the nodes so that we do not // use the generic VSELECT anymore. Otherwise, we may perform // wrong optimizations as we messed up with the actual expectation // for the vector boolean values. if (Cond != TLO.Old) { // Check all uses of that condition operand to check whether it will be // consumed by non-BLEND instructions, which may depend on all bits are // set properly. for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); I != E; ++I) if (I->getOpcode() != ISD::VSELECT) // TODO: Add other opcodes eventually lowered into BLEND. return SDValue(); // Update all the users of the condition, before committing the change, // so that the VSELECT optimizations that expect the correct vector // boolean value will not be triggered. for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); I != E; ++I) DAG.ReplaceAllUsesOfValueWith( SDValue(*I, 0), DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0), Cond, I->getOperand(1), I->getOperand(2))); DCI.CommitTargetLoweringOpt(TLO); return SDValue(); } // At this point, only Cond is changed. Change the condition // just for N to keep the opportunity to optimize all other // users their own way. DAG.ReplaceAllUsesOfValueWith( SDValue(N, 0), DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0), TLO.New, N->getOperand(1), N->getOperand(2))); return SDValue(); } } // We should generate an X86ISD::BLENDI from a vselect if its argument // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of // constants. This specific pattern gets generated when we split a // selector for a 512 bit vector in a machine without AVX512 (but with // 256-bit vectors), during legalization: // // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) // // Iff we find this pattern and the build_vectors are built from // constants, we translate the vselect into a shuffle_vector that we // know will be matched by LowerVECTOR_SHUFFLEtoBlend. if ((N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SHRUNKBLEND) && !DCI.isBeforeLegalize()) { SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); if (Shuffle.getNode()) return Shuffle; } return SDValue(); } // Check whether a boolean test is testing a boolean value generated by // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition // code. // // Simplify the following patterns: // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) // to (Op EFLAGS Cond) // // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) // to (Op EFLAGS !Cond) // // where Op could be BRCOND or CMOV. // static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { // Quit if not CMP and SUB with its value result used. if (Cmp.getOpcode() != X86ISD::CMP && (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0))) return SDValue(); // Quit if not used as a boolean value. if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); // Check CMP operands. One of them should be 0 or 1 and the other should be // an SetCC or extended from it. SDValue Op1 = Cmp.getOperand(0); SDValue Op2 = Cmp.getOperand(1); SDValue SetCC; const ConstantSDNode* C = nullptr; bool needOppositeCond = (CC == X86::COND_E); bool checkAgainstTrue = false; // Is it a comparison against 1? if ((C = dyn_cast(Op1))) SetCC = Op2; else if ((C = dyn_cast(Op2))) SetCC = Op1; else // Quit if all operands are not constants. return SDValue(); if (C->getZExtValue() == 1) { needOppositeCond = !needOppositeCond; checkAgainstTrue = true; } else if (C->getZExtValue() != 0) // Quit if the constant is neither 0 or 1. return SDValue(); bool truncatedToBoolWithAnd = false; // Skip (zext $x), (trunc $x), or (and $x, 1) node. while (SetCC.getOpcode() == ISD::ZERO_EXTEND || SetCC.getOpcode() == ISD::TRUNCATE || SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; ConstantSDNode *CS; if ((CS = dyn_cast(SetCC.getOperand(0))) && CS->getZExtValue() == 1) OpIdx = 1; if ((CS = dyn_cast(SetCC.getOperand(1))) && CS->getZExtValue() == 1) OpIdx = 0; if (OpIdx == -1) break; SetCC = SetCC.getOperand(OpIdx); truncatedToBoolWithAnd = true; } else SetCC = SetCC.getOperand(0); } switch (SetCC.getOpcode()) { case X86ISD::SETCC_CARRY: // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, // i.e. it's a comparison against true but the result of SETCC_CARRY is not // truncated to i1 using 'and'. if (checkAgainstTrue && !truncatedToBoolWithAnd) break; assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && "Invalid use of SETCC_CARRY!"); // FALL THROUGH case X86ISD::SETCC: // Set the condition code or opposite one if necessary. CC = X86::CondCode(SetCC.getConstantOperandVal(0)); if (needOppositeCond) CC = X86::GetOppositeBranchCondition(CC); return SetCC.getOperand(1); case X86ISD::CMOV: { // Check whether false/true value has canonical one, i.e. 0 or 1. ConstantSDNode *FVal = dyn_cast(SetCC.getOperand(0)); ConstantSDNode *TVal = dyn_cast(SetCC.getOperand(1)); // Quit if true value is not a constant. if (!TVal) return SDValue(); // Quit if false value is not a constant. if (!FVal) { SDValue Op = SetCC.getOperand(0); // Skip 'zext' or 'trunc' node. if (Op.getOpcode() == ISD::ZERO_EXTEND || Op.getOpcode() == ISD::TRUNCATE) Op = Op.getOperand(0); // A special case for rdrand/rdseed, where 0 is set if false cond is // found. if ((Op.getOpcode() != X86ISD::RDRAND && Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) return SDValue(); } // Quit if false value is not the constant 0 or 1. bool FValIsFalse = true; if (FVal && FVal->getZExtValue() != 0) { if (FVal->getZExtValue() != 1) return SDValue(); // If FVal is 1, opposite cond is needed. needOppositeCond = !needOppositeCond; FValIsFalse = false; } // Quit if TVal is not the constant opposite of FVal. if (FValIsFalse && TVal->getZExtValue() != 1) return SDValue(); if (!FValIsFalse && TVal->getZExtValue() != 0) return SDValue(); CC = X86::CondCode(SetCC.getConstantOperandVal(2)); if (needOppositeCond) CC = X86::GetOppositeBranchCondition(CC); return SetCC.getOperand(3); } } return SDValue(); } /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc DL(N); // If the flag operand isn't dead, don't touch this CMOV. if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) return SDValue(); SDValue FalseOp = N->getOperand(0); SDValue TrueOp = N->getOperand(1); X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); SDValue Cond = N->getOperand(3); if (CC == X86::COND_E || CC == X86::COND_NE) { switch (Cond.getOpcode()) { default: break; case X86ISD::BSR: case X86ISD::BSF: // If operand of BSR / BSF are proven never zero, then ZF cannot be set. if (DAG.isKnownNeverZero(Cond.getOperand(0))) return (CC == X86::COND_E) ? FalseOp : TrueOp; } } SDValue Flags; Flags = checkBoolTestSetCCCombine(Cond, CC); if (Flags.getNode() && // Extra check as FCMOV only supports a subset of X86 cond. (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { SDValue Ops[] = { FalseOp, TrueOp, DAG.getConstant(CC, MVT::i8), Flags }; return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); } // If this is a select between two integer constants, try to do some // optimizations. Note that the operands are ordered the opposite of SELECT // operands. if (ConstantSDNode *TrueC = dyn_cast(TrueOp)) { if (ConstantSDNode *FalseC = dyn_cast(FalseOp)) { // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is // larger than FalseC (the false value). if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueC, FalseC); std::swap(TrueOp, FalseOp); } // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. // This is efficient for any integer data type (including i8/i16) and // shift amount. if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, MVT::i8), Cond); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); unsigned ShAmt = TrueC->getAPIntValue().logBase2(); Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, DAG.getConstant(ShAmt, MVT::i8)); if (N->getNumValues() == 2) // Dead flag value? return DCI.CombineTo(N, Cond, SDValue()); return Cond; } // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient // for any integer data type, including i8/i16. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, MVT::i8), Cond); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); if (N->getNumValues() == 2) // Dead flag value? return DCI.CombineTo(N, Cond, SDValue()); return Cond; } // Optimize cases that will turn into an LEA instruction. This requires // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; bool isFastMultiplier = false; if (Diff < 10) { switch ((unsigned char)Diff) { default: break; case 1: // result = add base, cond case 2: // result = lea base( , cond*2) case 3: // result = lea base(cond, cond*2) case 4: // result = lea base( , cond*4) case 5: // result = lea base(cond, cond*4) case 8: // result = lea base( , cond*8) case 9: // result = lea base(cond, cond*8) isFastMultiplier = true; break; } } if (isFastMultiplier) { APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, MVT::i8), Cond); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); // Scale the condition by the difference. if (Diff != 1) Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, DAG.getConstant(Diff, Cond.getValueType())); // Add the base if non-zero. if (FalseC->getAPIntValue() != 0) Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); if (N->getNumValues() == 2) // Dead flag value? return DCI.CombineTo(N, Cond, SDValue()); return Cond; } } } } // Handle these cases: // (select (x != c), e, c) -> select (x != c), e, x), // (select (x == c), c, e) -> select (x == c), x, e) // where the c is an integer constant, and the "select" is the combination // of CMOV and CMP. // // The rationale for this change is that the conditional-move from a constant // needs two instructions, however, conditional-move from a register needs // only one instruction. // // CAVEAT: By replacing a constant with a symbolic value, it may obscure // some instruction-combining opportunities. This opt needs to be // postponed as late as possible. // if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { // the DCI.xxxx conditions are provided to postpone the optimization as // late as possible. ConstantSDNode *CmpAgainst = nullptr; if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && (CmpAgainst = dyn_cast(Cond.getOperand(1))) && !isa(Cond.getOperand(0))) { if (CC == X86::COND_NE && CmpAgainst == dyn_cast(FalseOp)) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueOp, FalseOp); } if (CC == X86::COND_E && CmpAgainst == dyn_cast(TrueOp)) { SDValue Ops[] = { FalseOp, Cond.getOperand(0), DAG.getConstant(CC, MVT::i8), Cond }; return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops); } } } return SDValue(); } static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); switch (IntNo) { default: return SDValue(); // SSE/AVX/AVX2 blend intrinsics. case Intrinsic::x86_avx2_pblendvb: case Intrinsic::x86_avx2_pblendw: case Intrinsic::x86_avx2_pblendd_128: case Intrinsic::x86_avx2_pblendd_256: // Don't try to simplify this intrinsic if we don't have AVX2. if (!Subtarget->hasAVX2()) return SDValue(); // FALL-THROUGH case Intrinsic::x86_avx_blend_pd_256: case Intrinsic::x86_avx_blend_ps_256: case Intrinsic::x86_avx_blendv_pd_256: case Intrinsic::x86_avx_blendv_ps_256: // Don't try to simplify this intrinsic if we don't have AVX. if (!Subtarget->hasAVX()) return SDValue(); // FALL-THROUGH case Intrinsic::x86_sse41_pblendw: case Intrinsic::x86_sse41_blendpd: case Intrinsic::x86_sse41_blendps: case Intrinsic::x86_sse41_blendvps: case Intrinsic::x86_sse41_blendvpd: case Intrinsic::x86_sse41_pblendvb: { SDValue Op0 = N->getOperand(1); SDValue Op1 = N->getOperand(2); SDValue Mask = N->getOperand(3); // Don't try to simplify this intrinsic if we don't have SSE4.1. if (!Subtarget->hasSSE41()) return SDValue(); // fold (blend A, A, Mask) -> A if (Op0 == Op1) return Op0; // fold (blend A, B, allZeros) -> A if (ISD::isBuildVectorAllZeros(Mask.getNode())) return Op0; // fold (blend A, B, allOnes) -> B if (ISD::isBuildVectorAllOnes(Mask.getNode())) return Op1; // Simplify the case where the mask is a constant i32 value. if (ConstantSDNode *C = dyn_cast(Mask)) { if (C->isNullValue()) return Op0; if (C->isAllOnesValue()) return Op1; } return SDValue(); } // Packed SSE2/AVX2 arithmetic shift immediate intrinsics. case Intrinsic::x86_sse2_psrai_w: case Intrinsic::x86_sse2_psrai_d: case Intrinsic::x86_avx2_psrai_w: case Intrinsic::x86_avx2_psrai_d: case Intrinsic::x86_sse2_psra_w: case Intrinsic::x86_sse2_psra_d: case Intrinsic::x86_avx2_psra_w: case Intrinsic::x86_avx2_psra_d: { SDValue Op0 = N->getOperand(1); SDValue Op1 = N->getOperand(2); EVT VT = Op0.getValueType(); assert(VT.isVector() && "Expected a vector type!"); if (isa(Op1)) Op1 = Op1.getOperand(0); if (!isa(Op1)) return SDValue(); EVT SVT = VT.getVectorElementType(); unsigned SVTBits = SVT.getSizeInBits(); ConstantSDNode *CND = cast(Op1); const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue()); uint64_t ShAmt = C.getZExtValue(); // Don't try to convert this shift into a ISD::SRA if the shift // count is bigger than or equal to the element size. if (ShAmt >= SVTBits) return SDValue(); // Trivial case: if the shift count is zero, then fold this // into the first operand. if (ShAmt == 0) return Op0; // Replace this packed shift intrinsic with a target independent // shift dag node. SDValue Splat = DAG.getConstant(C, VT); return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat); } } } /// PerformMulCombine - Optimize a single multiply with constant into two /// in order to implement it with two cheaper instructions, e.g. /// LEA + SHL, LEA + LEA. static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); EVT VT = N->getValueType(0); if (VT != MVT::i64) return SDValue(); ConstantSDNode *C = dyn_cast(N->getOperand(1)); if (!C) return SDValue(); uint64_t MulAmt = C->getZExtValue(); if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) return SDValue(); uint64_t MulAmt1 = 0; uint64_t MulAmt2 = 0; if ((MulAmt % 9) == 0) { MulAmt1 = 9; MulAmt2 = MulAmt / 9; } else if ((MulAmt % 5) == 0) { MulAmt1 = 5; MulAmt2 = MulAmt / 5; } else if ((MulAmt % 3) == 0) { MulAmt1 = 3; MulAmt2 = MulAmt / 3; } if (MulAmt2 && (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ SDLoc DL(N); if (isPowerOf2_64(MulAmt2) && !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) // If second multiplifer is pow2, issue it first. We want the multiply by // 3, 5, or 9 to be folded into the addressing mode unless the lone use // is an add. std::swap(MulAmt1, MulAmt2); SDValue NewMul; if (isPowerOf2_64(MulAmt1)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(MulAmt1, VT)); if (isPowerOf2_64(MulAmt2)) NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, VT)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, NewMul, false); } return SDValue(); } static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N0.getValueType(); // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) // since the result of setcc_c is all zero's or all ones. if (VT.isInteger() && !VT.isVector() && N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY || ((N00.getOpcode() == ISD::ANY_EXTEND || N00.getOpcode() == ISD::ZERO_EXTEND) && N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); APInt ShAmt = N1C->getAPIntValue(); Mask = Mask.shl(ShAmt); if (Mask != 0) return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, DAG.getConstant(Mask, VT)); } } // Hardware support for vector shifts is sparse which makes us scalarize the // vector operations in many cases. Also, on sandybridge ADD is faster than // shl. // (shl V, 1) -> add V,V if (auto *N1BV = dyn_cast(N1)) if (auto *N1SplatC = N1BV->getConstantSplatNode()) { assert(N0.getValueType().isVector() && "Invalid vector shift type"); // We shift all of the values by one. In many cases we do not have // hardware support for this operation. This is better expressed as an ADD // of two values. if (N1SplatC->getZExtValue() == 1) return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } return SDValue(); } /// \brief Returns a vector of 0s if the node in input is a vector logical /// shift by a constant amount which is known to be bigger than or equal /// to the vector element size in bits. static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && (!Subtarget->hasInt256() || (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) return SDValue(); SDValue Amt = N->getOperand(1); SDLoc DL(N); if (auto *AmtBV = dyn_cast(Amt)) if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { APInt ShiftAmt = AmtSplat->getAPIntValue(); unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); // SSE2/AVX2 logical shifts always return a vector of 0s // if the shift amount is bigger than or equal to // the element size. The constant shift amount will be // encoded as a 8-bit immediate. if (ShiftAmt.trunc(8).uge(MaxAmount)) return getZeroVector(VT, Subtarget, DAG, DL); } return SDValue(); } /// PerformShiftCombine - Combine shifts. static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (N->getOpcode() == ISD::SHL) { SDValue V = PerformSHLCombine(N, DAG); if (V.getNode()) return V; } if (N->getOpcode() != ISD::SRA) { // Try to fold this logical shift into a zero vector. SDValue V = performShiftToAllZeros(N, DAG, Subtarget); if (V.getNode()) return V; } return SDValue(); } // CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) // where both setccs reference the same FP CMP, and rewrite for CMPEQSS // and friends. Likewise for OR -> CMPNEQSS. static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { unsigned opcode; // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but // we're requiring SSE2 for both. if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CMP0 = N0->getOperand(1); SDValue CMP1 = N1->getOperand(1); SDLoc DL(N); // The SETCCs should both refer to the same CMP. if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) return SDValue(); SDValue CMP00 = CMP0->getOperand(0); SDValue CMP01 = CMP0->getOperand(1); EVT VT = CMP00.getValueType(); if (VT == MVT::f32 || VT == MVT::f64) { bool ExpectingFlags = false; // Check for any users that want flags: for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); !ExpectingFlags && UI != UE; ++UI) switch (UI->getOpcode()) { default: case ISD::BR_CC: case ISD::BRCOND: case ISD::SELECT: ExpectingFlags = true; break; case ISD::CopyToReg: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: break; } if (!ExpectingFlags) { enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { X86::CondCode tmp = cc0; cc0 = cc1; cc1 = tmp; } if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { // FIXME: need symbolic constants for these magic numbers. // See X86ATTInstPrinter.cpp:printSSECC(). unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; if (Subtarget->hasAVX512()) { SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00, CMP01, DAG.getConstant(x86cc, MVT::i8)); if (N->getValueType(0) != MVT::i1) return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), FSetCC); return FSetCC; } SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, CMP01, DAG.getConstant(x86cc, MVT::i8)); bool is64BitFP = (CMP00.getValueType() == MVT::f64); MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; if (is64BitFP && !Subtarget->is64Bit()) { // On a 32-bit target, we cannot bitcast the 64-bit float to a // 64-bit integer, since that's not a legal type. Since // OnesOrZeroesF is all ones of all zeroes, we don't need all the // bits, but can do this little dance to extract the lowest 32 bits // and work with those going forward. SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, OnesOrZeroesF); SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Vector64); OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32, DAG.getIntPtrConstant(0)); IntVT = MVT::i32; } SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF); SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, DAG.getConstant(1, IntVT)); SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); return OneBitOfTruth; } } } } return SDValue(); } /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector /// so it can be folded inside ANDNP. static bool CanFoldXORWithAllOnes(const SDNode *N) { EVT VT = N->getValueType(0); // Match direct AllOnes for 128 and 256-bit vectors if (ISD::isBuildVectorAllOnes(N)) return true; // Look through a bit convert. if (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); // Sometimes the operand may come from a insert_subvector building a 256-bit // allones vector if (VT.is256BitVector() && N->getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && V1.getOperand(0).getOpcode() == ISD::UNDEF && ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && ISD::isBuildVectorAllOnes(V2.getNode())) return true; } return false; } // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized // register. In most cases we actually compare or select YMM-sized registers // and mixing the two types creates horrible code. This method optimizes // some of the transition sequences. static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); if (!VT.is256BitVector()) return SDValue(); assert((N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); SDValue Narrow = N->getOperand(0); EVT NarrowVT = Narrow->getValueType(0); if (!NarrowVT.is128BitVector()) return SDValue(); if (Narrow->getOpcode() != ISD::XOR && Narrow->getOpcode() != ISD::AND && Narrow->getOpcode() != ISD::OR) return SDValue(); SDValue N0 = Narrow->getOperand(0); SDValue N1 = Narrow->getOperand(1); SDLoc DL(Narrow); // The Left side has to be a trunc. if (N0.getOpcode() != ISD::TRUNCATE) return SDValue(); // The type of the truncated inputs. EVT WideVT = N0->getOperand(0)->getValueType(0); if (WideVT != VT) return SDValue(); // The right side has to be a 'trunc' or a constant vector. bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; ConstantSDNode *RHSConstSplat = nullptr; if (auto *RHSBV = dyn_cast(N1)) RHSConstSplat = RHSBV->getConstantSplatNode(); if (!RHSTrunc && !RHSConstSplat) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT)) return SDValue(); // Set N0 and N1 to hold the inputs to the new wide operation. N0 = N0->getOperand(0); if (RHSConstSplat) { N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), SDValue(RHSConstSplat, 0)); SmallVector C(WideVT.getVectorNumElements(), N1); N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); } else if (RHSTrunc) { N1 = N1->getOperand(0); } // Generate the wide operation. SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); unsigned Opcode = N->getOpcode(); switch (Opcode) { case ISD::ANY_EXTEND: return Op; case ISD::ZERO_EXTEND: { unsigned InBits = NarrowVT.getScalarType().getSizeInBits(); APInt Mask = APInt::getAllOnesValue(InBits); Mask = Mask.zext(VT.getScalarType().getSizeInBits()); return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(Mask, VT)); } case ISD::SIGN_EXTEND: return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(NarrowVT)); default: llvm_unreachable("Unexpected opcode"); } } static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); if (R.getNode()) return R; // Create BEXTR instructions // BEXTR is ((X >> imm) & (2**size-1)) if (VT == MVT::i32 || VT == MVT::i64) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); // Check for BEXTR. if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { ConstantSDNode *MaskNode = dyn_cast(N1); ConstantSDNode *ShiftNode = dyn_cast(N0.getOperand(1)); if (MaskNode && ShiftNode) { uint64_t Mask = MaskNode->getZExtValue(); uint64_t Shift = ShiftNode->getZExtValue(); if (isMask_64(Mask)) { uint64_t MaskSize = CountPopulation_64(Mask); if (Shift + MaskSize <= VT.getSizeInBits()) return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), DAG.getConstant(Shift | (MaskSize << 8), VT)); } } } // BEXTR return SDValue(); } // Want to form ANDNP nodes: // 1) In the hopes of then easily combining them with OR and AND nodes // to form PBLEND/PSIGN. // 2) To match ANDN packed intrinsics if (VT != MVT::v2i64 && VT != MVT::v4i64) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); // Check LHS for vnot if (N0.getOpcode() == ISD::XOR && //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); // Check RHS for vnot if (N1.getOpcode() == ISD::XOR && //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); return SDValue(); } static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); if (R.getNode()) return R; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); // look for psign/blend if (VT == MVT::v2i64 || VT == MVT::v4i64) { if (!Subtarget->hasSSSE3() || (VT == MVT::v4i64 && !Subtarget->hasInt256())) return SDValue(); // Canonicalize pandn to RHS if (N0.getOpcode() == X86ISD::ANDNP) std::swap(N0, N1); // or (and (m, y), (pandn m, x)) if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { SDValue Mask = N1.getOperand(0); SDValue X = N1.getOperand(1); SDValue Y; if (N0.getOperand(0) == Mask) Y = N0.getOperand(1); if (N0.getOperand(1) == Mask) Y = N0.getOperand(0); // Check to see if the mask appeared in both the AND and ANDNP and if (!Y.getNode()) return SDValue(); // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. // Look through mask bitcast. if (Mask.getOpcode() == ISD::BITCAST) Mask = Mask.getOperand(0); if (X.getOpcode() == ISD::BITCAST) X = X.getOperand(0); if (Y.getOpcode() == ISD::BITCAST) Y = Y.getOperand(0); EVT MaskVT = Mask.getValueType(); // Validate that the Mask operand is a vector sra node. // FIXME: what to do for bytes, since there is a psignb/pblendvb, but // there is no psrai.b unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); unsigned SraAmt = ~0; if (Mask.getOpcode() == ISD::SRA) { if (auto *AmtBV = dyn_cast(Mask.getOperand(1))) if (auto *AmtConst = AmtBV->getConstantSplatNode()) SraAmt = AmtConst->getZExtValue(); } else if (Mask.getOpcode() == X86ISD::VSRAI) { SDValue SraC = Mask.getOperand(1); SraAmt = cast(SraC)->getZExtValue(); } if ((SraAmt + 1) != EltBits) return SDValue(); SDLoc DL(N); // Now we know we at least have a plendvb with the mask val. See if // we can form a psignb/w/d. // psign = x.type == y.type == mask.type && y = sub(0, x); if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Unsupported VT for PSIGN"); Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } // PBLENDVB only available on SSE 4.1 if (!Subtarget->hasSSE41()) return SDValue(); EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } } if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) MachineFunction &MF = DAG.getMachineFunction(); bool OptForSize = MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); // SHLD/SHRD instructions have lower register pressure, but on some // platforms they have higher latency than the equivalent // series of shifts/or that would otherwise be generated. // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions // have higher latencies and we are not optimizing for size. if (!OptForSize && Subtarget->isSHLDSlow()) return SDValue(); if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) std::swap(N0, N1); if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) return SDValue(); if (!N0.hasOneUse() || !N1.hasOneUse()) return SDValue(); SDValue ShAmt0 = N0.getOperand(1); if (ShAmt0.getValueType() != MVT::i8) return SDValue(); SDValue ShAmt1 = N1.getOperand(1); if (ShAmt1.getValueType() != MVT::i8) return SDValue(); if (ShAmt0.getOpcode() == ISD::TRUNCATE) ShAmt0 = ShAmt0.getOperand(0); if (ShAmt1.getOpcode() == ISD::TRUNCATE) ShAmt1 = ShAmt1.getOperand(0); SDLoc DL(N); unsigned Opc = X86ISD::SHLD; SDValue Op0 = N0.getOperand(0); SDValue Op1 = N1.getOperand(0); if (ShAmt0.getOpcode() == ISD::SUB) { Opc = X86ISD::SHRD; std::swap(Op0, Op1); std::swap(ShAmt0, ShAmt1); } unsigned Bits = VT.getSizeInBits(); if (ShAmt1.getOpcode() == ISD::SUB) { SDValue Sum = ShAmt1.getOperand(0); if (ConstantSDNode *SumC = dyn_cast(Sum)) { SDValue ShAmt1Op1 = ShAmt1.getOperand(1); if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) ShAmt1Op1 = ShAmt1Op1.getOperand(0); if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) return DAG.getNode(Opc, DL, VT, Op0, Op1, DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); } } else if (ConstantSDNode *ShAmt1C = dyn_cast(ShAmt1)) { ConstantSDNode *ShAmt0C = dyn_cast(ShAmt0); if (ShAmt0C && ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) return DAG.getNode(Opc, DL, VT, N0.getOperand(0), N1.getOperand(0), DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); } return SDValue(); } // Generate NEG and CMOV for integer abs. static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); // Since X86 does not have CMOV for 8-bit integer, we don't convert // 8-bit integer abs to NEG and CMOV. if (VT.isInteger() && VT.getSizeInBits() == 8) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) // and change it to SUB and CMOV. if (VT.isInteger() && N->getOpcode() == ISD::XOR && N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) if (ConstantSDNode *Y1C = dyn_cast(N1.getOperand(1))) if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { // Generate SUB & CMOV. SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), DAG.getConstant(0, VT), N0.getOperand(0)); SDValue Ops[] = { N0.getOperand(0), Neg, DAG.getConstant(X86::COND_GE, MVT::i8), SDValue(Neg.getNode(), 1) }; return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); } return SDValue(); } // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); if (Subtarget->hasCMov()) { SDValue RV = performIntegerAbsCombine(N, DAG); if (RV.getNode()) return RV; } return SDValue(); } /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { LoadSDNode *Ld = cast(N); EVT RegVT = Ld->getValueType(0); EVT MemVT = Ld->getMemoryVT(); SDLoc dl(Ld); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // For chips with slow 32-byte unaligned loads, break the 32-byte operation // into two 16-byte operations. ISD::LoadExtType Ext = Ld->getExtensionType(); unsigned Alignment = Ld->getAlignment(); bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); SDValue Ptr = Ld->getBasePtr(); SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems/2); SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Alignment); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), std::min(16U, Alignment)); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), Load2.getValue(1)); SDValue NewVec = DAG.getUNDEF(RegVT); NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); return DCI.CombineTo(N, NewVec, TF, true); } return SDValue(); } +/// PerformMLOADCombine - Resolve extending loads +static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + MaskedLoadSDNode *Mld = cast(N); + if (Mld->getExtensionType() != ISD::SEXTLOAD) + return SDValue(); + + EVT VT = Mld->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NumElems = VT.getVectorNumElements(); + EVT LdVT = Mld->getMemoryVT(); + SDLoc dl(Mld); + + assert(LdVT != VT && "Cannot extend to the same type"); + unsigned ToSz = VT.getVectorElementType().getSizeInBits(); + unsigned FromSz = LdVT.getVectorElementType().getSizeInBits(); + // From, To sizes and ElemCount must be pow of two + assert (isPowerOf2_32(NumElems * FromSz * ToSz) && + "Unexpected size for extending masked load"); + + unsigned SizeRatio = ToSz / FromSz; + assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + LdVT.getScalarType(), NumElems*SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + // Convert Src0 value + SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0()); + if (Mld->getSrc0().getOpcode() != ISD::UNDEF) { + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + assert (TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal"); + WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, + DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + } + // Prepare the new mask + SDValue NewMask; + SDValue Mask = Mld->getMask(); + if (Mask.getValueType() == VT) { + // Mask and original value have the same type + NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask); + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) + ShuffleVec[i] = NumElems*SizeRatio; + NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, + DAG.getConstant(0, WideVecVT), + &ShuffleVec[0]); + } + else { + assert(Mask.getValueType().getVectorElementType() == MVT::i1); + unsigned WidenNumElts = NumElems*SizeRatio; + unsigned MaskNumElts = VT.getVectorNumElements(); + EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WidenNumElts); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SmallVector Ops(NumConcat); + SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType()); + Ops[0] = Mask; + for (unsigned i = 1; i != NumConcat; ++i) + Ops[i] = ZeroVal; + + NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); + } + + SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), + Mld->getBasePtr(), NewMask, WideSrc0, + Mld->getMemoryVT(), Mld->getMemOperand(), + ISD::NON_EXTLOAD); + SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); + return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); + +} +/// PerformMSTORECombine - Resolve truncating stores +static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + MaskedStoreSDNode *Mst = cast(N); + if (!Mst->isTruncatingStore()) + return SDValue(); + + EVT VT = Mst->getValue().getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NumElems = VT.getVectorNumElements(); + EVT StVT = Mst->getMemoryVT(); + SDLoc dl(Mst); + + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + + // From, To sizes and ElemCount must be pow of two + assert (isPowerOf2_32(NumElems * FromSz * ToSz) && + "Unexpected size for truncating masked store"); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + assert (((NumElems * FromSz) % ToSz) == 0 && + "Unexpected ratio for truncating masked store"); + + unsigned SizeRatio = FromSz / ToSz; + assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + StVT.getScalarType(), NumElems*SizeRatio); + + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue()); + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + assert (TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal"); + + SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); + + SDValue NewMask; + SDValue Mask = Mst->getMask(); + if (Mask.getValueType() == VT) { + // Mask and original value have the same type + NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) + ShuffleVec[i] = NumElems*SizeRatio; + NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, + DAG.getConstant(0, WideVecVT), + &ShuffleVec[0]); + } + else { + assert(Mask.getValueType().getVectorElementType() == MVT::i1); + unsigned WidenNumElts = NumElems*SizeRatio; + unsigned MaskNumElts = VT.getVectorNumElements(); + EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WidenNumElts); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SmallVector Ops(NumConcat); + SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType()); + Ops[0] = Mask; + for (unsigned i = 1; i != NumConcat; ++i) + Ops[i] = ZeroVal; + + NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); + } + + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), + NewMask, StVT, Mst->getMemOperand(), false); +} /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { StoreSDNode *St = cast(N); EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); SDLoc dl(St); SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we are saving a concatenation of two XMM registers and 32-byte stores // are slow, such as on Sandy Bridge, perform two 16-byte stores. unsigned Alignment = St->getAlignment(); bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && StVT == VT && !IsAligned) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); SDValue Ptr0 = St->getBasePtr(); SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), Alignment); SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), std::min(16U, Alignment)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } // Optimize trunc store (of multiple scalars) to shuffle and store. // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. if (St->isTruncatingStore() && VT.isVector()) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); // From, To sizes and ElemCount must be pow of two if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); // We are going to use the original vector elt for storing. // Accumulated smaller vector elements must be a multiple of the store size. if (0 != (NumElems * FromSz) % ToSz) return SDValue(); unsigned SizeRatio = FromSz / ToSz; assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); // At this point all of the data is stored at the bottom of the // register. We now need to save it to mem. // Find the largest store unit MVT StoreType = MVT::i8; for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) StoreType = Tp; } // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && (64 <= NumElems * ToSz)) StoreType = MVT::f64; // Bitcast the original vector into a vector of store-size units EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); SmallVector Chains; SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, TLI.getPointerTy()); SDValue Ptr = St->getBasePtr(); // Perform one or more big stores into memory. for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StoreType, ShuffWide, DAG.getIntPtrConstant(i)); SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); Chains.push_back(Ch); } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } // Turn load->store of MMX types into GPR load/stores. This avoids clobbering // the FP state in cases where an emms may be missing. // A preferable solution to the general problem is to figure out the right // places to insert EMMS. This qualifies as a quick hack. // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. if (VT.getSizeInBits() != 64) return SDValue(); const Function *F = DAG.getMachineFunction().getFunction(); bool NoImplicitFloatOps = F->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps && Subtarget->hasSSE2(); if ((VT.isVector() || (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && isa(St->getValue()) && !cast(St->getValue())->isVolatile() && St->getChain().hasOneUse() && !St->isVolatile()) { SDNode* LdVal = St->getValue().getNode(); LoadSDNode *Ld = nullptr; int TokenFactorIndex = -1; SmallVector Ops; SDNode* ChainVal = St->getChain().getNode(); // Must be a store of a load. We currently handle two cases: the load // is a direct child, and it's under an intervening TokenFactor. It is // possible to dig deeper under nested TokenFactors. if (ChainVal == LdVal) Ld = cast(St->getChain()); else if (St->getValue().hasOneUse() && ChainVal->getOpcode() == ISD::TokenFactor) { for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { if (ChainVal->getOperand(i).getNode() == LdVal) { TokenFactorIndex = i; Ld = cast(St->getValue()); } else Ops.push_back(ChainVal->getOperand(i)); } } if (!Ld || !ISD::isNormalLoad(Ld)) return SDValue(); // If this is not the MMX case, i.e. we are just turning i64 load/store // into f64 load/store, avoid the transformation if there are multiple // uses of the loaded value. if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) return SDValue(); SDLoc LdDL(Ld); SDLoc StDL(N); // If we are a 64-bit capable x86, lower to a single movq load/store pair. // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. if (Subtarget->is64Bit() || F64IsLegal) { EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Ld->getAlignment()); SDValue NewChain = NewLd.getValue(1); if (TokenFactorIndex != -1) { Ops.push_back(NewChain); NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); } return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); } // Otherwise, lower to two pairs of 32-bit loads / stores. SDValue LoAddr = Ld->getBasePtr(); SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, DAG.getConstant(4, MVT::i32)); SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Ld->getAlignment()); SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, Ld->getPointerInfo().getWithOffset(4), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), MinAlign(Ld->getAlignment(), 4)); SDValue NewChain = LoLd.getValue(1); if (TokenFactorIndex != -1) { Ops.push_back(LoLd); Ops.push_back(HiLd); NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); } LoAddr = St->getBasePtr(); HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, DAG.getConstant(4, MVT::i32)); SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4), St->isVolatile(), St->isNonTemporal(), MinAlign(St->getAlignment(), 4)); return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); } return SDValue(); } /// Return 'true' if this vector operation is "horizontal" /// and return the operands for the horizontal operation in LHS and RHS. A /// horizontal operation performs the binary operation on successive elements /// of its first operand, then on successive elements of its second operand, /// returning the resulting values in a vector. For example, if /// A = < float a0, float a1, float a2, float a3 > /// and /// B = < float b0, float b1, float b2, float b3 > /// then the result of doing a horizontal operation on A and B is /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. /// Note that the binary operation should have the property that if one of the /// operands is UNDEF then the result is UNDEF. static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // Look for the following pattern: if // A = < float a0, float a1, float a2, float a3 > // B = < float b0, float b1, float b2, float b3 > // and // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > // which is A horizontal-op B. // At least one of the operands should be a vector shuffle. if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && RHS.getOpcode() != ISD::VECTOR_SHUFFLE) return false; MVT VT = LHS.getSimpleValueType(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for horizontal add/sub"); // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to // operate independently on 128-bit lanes. unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts / NumLanes; assert((NumLaneElts % 2 == 0) && "Vector type should have an even number of elements in each lane"); unsigned HalfLaneElts = NumLaneElts/2; // View LHS in the form // LHS = VECTOR_SHUFFLE A, B, LMask // If LHS is not a shuffle then pretend it is the shuffle // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> // NOTE: in what follows a default initialized SDValue represents an UNDEF of // type VT. SDValue A, B; SmallVector LMask(NumElts); if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) A = LHS.getOperand(0); if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) B = LHS.getOperand(1); ArrayRef Mask = cast(LHS.getNode())->getMask(); std::copy(Mask.begin(), Mask.end(), LMask.begin()); } else { if (LHS.getOpcode() != ISD::UNDEF) A = LHS; for (unsigned i = 0; i != NumElts; ++i) LMask[i] = i; } // Likewise, view RHS in the form // RHS = VECTOR_SHUFFLE C, D, RMask SDValue C, D; SmallVector RMask(NumElts); if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) C = RHS.getOperand(0); if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) D = RHS.getOperand(1); ArrayRef Mask = cast(RHS.getNode())->getMask(); std::copy(Mask.begin(), Mask.end(), RMask.begin()); } else { if (RHS.getOpcode() != ISD::UNDEF) C = RHS; for (unsigned i = 0; i != NumElts; ++i) RMask[i] = i; } // Check that the shuffles are both shuffling the same vectors. if (!(A == C && B == D) && !(A == D && B == C)) return false; // If everything is UNDEF then bail out: it would be better to fold to UNDEF. if (!A.getNode() && !B.getNode()) return false; // If A and B occur in reverse order in RHS, then "swap" them (which means // rewriting the mask). if (A != C) CommuteVectorShuffleMask(RMask, NumElts); // At this point LHS and RHS are equivalent to // LHS = VECTOR_SHUFFLE A, B, LMask // RHS = VECTOR_SHUFFLE A, B, RMask // Check that the masks correspond to performing a horizontal operation. for (unsigned l = 0; l != NumElts; l += NumLaneElts) { for (unsigned i = 0; i != NumLaneElts; ++i) { int LIdx = LMask[i+l], RIdx = RMask[i+l]; // Ignore any UNDEF components. if (LIdx < 0 || RIdx < 0 || (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; // Check that successive elements are being operated on. If not, this is // not a horizontal operation. unsigned Src = (i/HalfLaneElts); // each lane is split between srcs int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; if (!(LIdx == Index && RIdx == Index + 1) && !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) return false; } } LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. return true; } /// Do target-specific dag combines on floating point adds. static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); // Try to synthesize horizontal adds from adds of shuffles. if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, true)) return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS); return SDValue(); } /// Do target-specific dag combines on floating point subs. static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); // Try to synthesize horizontal subs from subs of shuffles. if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, false)) return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS); return SDValue(); } /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); // F[X]OR(0.0, x) -> x // F[X]OR(x, 0.0) -> x if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); if (ConstantFPSDNode *C = dyn_cast(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(0); return SDValue(); } /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); // Only perform optimizations if UnsafeMath is used. if (!DAG.getTarget().Options.UnsafeFPMath) return SDValue(); // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes // into FMINC and FMAXC, which are Commutative operations. unsigned NewOp = 0; switch (N->getOpcode()) { default: llvm_unreachable("unknown opcode"); case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; } return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1)); } /// Do target-specific dag combines on X86ISD::FAND nodes. static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { // FAND(0.0, x) -> 0.0 // FAND(x, 0.0) -> 0.0 if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) if (C->getValueAPF().isPosZero()) return N->getOperand(0); if (ConstantFPSDNode *C = dyn_cast(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); return SDValue(); } /// Do target-specific dag combines on X86ISD::FANDN nodes static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { // FANDN(x, 0.0) -> 0.0 // FANDN(0.0, x) -> x if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); if (ConstantFPSDNode *C = dyn_cast(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); return SDValue(); } static SDValue PerformBTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // BT ignores high bits in the bit index operand. SDValue Op1 = N->getOperand(1); if (Op1.hasOneUse()) { unsigned BitWidth = Op1.getValueSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) DCI.CommitTargetLoweringOpt(TLO); } return SDValue(); } static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { SDValue Op = N->getOperand(0); if (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); EVT VT = N->getValueType(0), OpVT = Op.getValueType(); if (Op.getOpcode() == X86ISD::VZEXT_LOAD && VT.getVectorElementType().getSizeInBits() == OpVT.getVectorElementType().getSizeInBits()) { return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); } return SDValue(); } static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); if (!VT.isVector()) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT ExtraVT = cast(N1)->getVT(); SDLoc dl(N); // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the // both SSE and AVX2 since there is no sign-extended shift right // operation on a vector with 64-bit elements. //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) -> // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT))) if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND)) { SDValue N00 = N0.getOperand(0); // EXTLOAD has a better solution on AVX2, // it may be replaced with X86ISD::VSEXT node. if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256()) if (!ISD::isNormalLoad(N00.getNode())) return SDValue(); if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1); return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); } } return SDValue(); } static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) // This exposes the sext to the sdivrem lowering, so that it directly extends // from AH (which we otherwise need to do contortions to access). if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && VT == MVT::i32) { SDLoc dl(N); SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys, N0.getOperand(0), N0.getOperand(1)); DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); return R.getValue(1); } if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (!Subtarget->hasFp256()) return SDValue(); if (VT.isVector() && VT.getSizeInBits() == 256) { SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); if (R.getNode()) return R; } return SDValue(); } static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget* Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); EVT ScalarVT = VT.getScalarType(); if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || (!Subtarget->hasFMA() && !Subtarget->hasFMA4())) return SDValue(); SDValue A = N->getOperand(0); SDValue B = N->getOperand(1); SDValue C = N->getOperand(2); bool NegA = (A.getOpcode() == ISD::FNEG); bool NegB = (B.getOpcode() == ISD::FNEG); bool NegC = (C.getOpcode() == ISD::FNEG); // Negative multiplication when NegA xor NegB bool NegMul = (NegA != NegB); if (NegA) A = A.getOperand(0); if (NegB) B = B.getOperand(0); if (NegC) C = C.getOperand(0); unsigned Opcode; if (!NegMul) Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; else Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; return DAG.getNode(Opcode, dl, VT, A, B, C); } static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> // (and (i32 x86isd::setcc_carry), 1) // This eliminates the zext. This transformation is necessary because // ISD::SETCC is always legalized to i8. SDLoc dl(N); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (N0.getOpcode() == ISD::AND && N0.hasOneUse() && N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { ConstantSDNode *C = dyn_cast(N0.getOperand(1)); if (!C || C->getZExtValue() != 1) return SDValue(); return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N00.getOperand(0), N00.getOperand(1)), DAG.getConstant(1, VT)); } } if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N00.getOperand(0), N00.getOperand(1)), DAG.getConstant(1, VT)); } } if (VT.is256BitVector()) { SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); if (R.getNode()) return R; } // (i8,i32 zext (udivrem (i8 x, i8 y)) -> // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y) // This exposes the zext to the udivrem lowering, so that it directly extends // from AH (which we otherwise need to do contortions to access). if (N0.getOpcode() == ISD::UDIVREM && N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && (VT == MVT::i32 || VT == MVT::i64)) { SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys, N0.getOperand(0), N0.getOperand(1)); DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); return R.getValue(1); } return SDValue(); } // Optimize x == -y --> x+y == 0 // x != -y --> x+y != 0 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget* Subtarget) { ISD::CondCode CC = cast(N->getOperand(2))->get(); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast(LHS.getOperand(0))) if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), LHS.getValueType(), RHS, LHS.getOperand(1)); return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, DAG.getConstant(0, addV.getValueType()), CC); } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast(RHS.getOperand(0))) if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), RHS.getValueType(), LHS, RHS.getOperand(1)); return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, DAG.getConstant(0, addV.getValueType()), CC); } if (VT.getScalarType() == MVT::i1) { bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) && (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode()); if (!IsSEXT0 && !IsVZero0) return SDValue(); bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) && (RHS.getOperand(0).getValueType().getScalarType() == MVT::i1); bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); if (!IsSEXT1 && !IsVZero1) return SDValue(); if (IsSEXT0 && IsVZero1) { assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type"); if (CC == ISD::SETEQ) return DAG.getNOT(DL, LHS.getOperand(0), VT); return LHS.getOperand(0); } if (IsSEXT1 && IsVZero0) { assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type"); if (CC == ISD::SETEQ) return DAG.getNOT(DL, RHS.getOperand(0), VT); return RHS.getOperand(0); } } return SDValue(); } static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDLoc dl(N); MVT VT = N->getOperand(1)->getSimpleValueType(0); assert((VT == MVT::v4f32 || VT == MVT::v4i32) && "X86insertps is only defined for v4x32"); SDValue Ld = N->getOperand(1); if (MayFoldLoad(Ld)) { // Extract the countS bits from the immediate so we can get the proper // address when narrowing the vector load to a specific element. // When the second source op is a memory address, interps doesn't use // countS and just gets an f32 from that address. unsigned DestIndex = cast(N->getOperand(2))->getZExtValue() >> 6; Ld = NarrowVectorLoadToElement(cast(Ld), DestIndex, DAG); } else return SDValue(); // Create this as a scalar to vector to match the instruction pattern. SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld); // countS bits are ignored when loading from memory on insertps, which // means we don't need to explicitly set them to 0. return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0), LoadScalarToVector, N->getOperand(2)); } // Helper function of PerformSETCCCombine. It is to materialize "setb reg" // as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG, MVT VT) { if (VT == MVT::i8) return DAG.getNode(ISD::AND, DL, VT, DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS), DAG.getConstant(1, VT)); assert (VT == MVT::i1 && "Unexpected type for SECCC node"); return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS)); } // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc DL(N); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); if (CC == X86::COND_A) { // Try to convert COND_A into COND_B in an attempt to facilitate // materializing "setb reg". // // Do not flip "e > c", where "c" is a constant, because Cmp instruction // cannot take an immediate as its first operand. // if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && EFLAGS.getValueType().isInteger() && !isa(EFLAGS.getOperand(1))) { SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0)); } } // Materialize "setb reg" as "sbb reg,reg", since it can be extended without // a zext and produces an all-ones bit which is more useful than 0/1 in some // cases. if (CC == X86::COND_B) return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0)); SDValue Flags; Flags = checkBoolTestSetCCCombine(EFLAGS, CC); if (Flags.getNode()) { SDValue Cond = DAG.getConstant(CC, MVT::i8); return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); } return SDValue(); } // Optimize branch condition evaluation. // static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc DL(N); SDValue Chain = N->getOperand(0); SDValue Dest = N->getOperand(1); SDValue EFLAGS = N->getOperand(3); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); SDValue Flags; Flags = checkBoolTestSetCCCombine(EFLAGS, CC); if (Flags.getNode()) { SDValue Cond = DAG.getConstant(CC, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, Flags); } return SDValue(); } static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG) { // Take advantage of vector comparisons producing 0 or -1 in each lane to // optimize away operation when it's from a constant. // // The general transformation is: // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> // AND(VECTOR_CMP(x,y), constant2) // constant2 = UNARYOP(constant) // Early exit if this isn't a vector operation, the operand of the // unary operation isn't a bitwise AND, or if the sizes of the operations // aren't the same. EVT VT = N->getValueType(0); if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) return SDValue(); // Now check that the other operand of the AND is a constant. We could // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. if (BuildVectorSDNode *BV = dyn_cast(N->getOperand(0)->getOperand(1))) { // Bail out if the vector isn't a constant. if (!BV->isConstant()) return SDValue(); // Everything checks out. Build up the new and improved node. SDLoc DL(N); EVT IntVT = BV->getValueType(0); // Create a new constant of the appropriate type for the transformed // DAG. SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); // The AND node needs bitcasts to/from an integer vector type around it. SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, N->getOperand(0)->getOperand(0), MaskConst); SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); return Res; } return SDValue(); } static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); if (Res != SDValue()) return Res; // Now move on to more general possibilities. SDValue Op0 = N->getOperand(0); EVT InVT = Op0->getValueType(0); // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { SDLoc dl(N); MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); } // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have // a 32-bit target where SSE doesn't support i64->FP operations. if (Op0.getOpcode() == ISD::LOAD) { LoadSDNode *Ld = cast(Op0.getNode()); EVT VT = Ld->getValueType(0); if (!Ld->isVolatile() && !N->getValueType(0).isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && !XTLI->getSubtarget()->is64Bit() && VT == MVT::i64) { SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); return FILDChain; } } return SDValue(); } // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI) { // If the LHS and RHS of the ADC node are zero, then it can't overflow and // the result is either zero or one (depending on the input carry bit). // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. if (X86::isZeroNode(N->getOperand(0)) && X86::isZeroNode(N->getOperand(1)) && // We don't have a good way to replace an EFLAGS use, so only do this when // dead right now. SDValue(N, 1).use_empty()) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B,MVT::i8), N->getOperand(2)), DAG.getConstant(1, VT)); return DCI.CombineTo(N, Res1, CarryOut); } return SDValue(); } // fold (add Y, (sete X, 0)) -> adc 0, Y // (add Y, (setne X, 0)) -> sbb -1, Y // (sub (sete X, 0), Y) -> sbb 0, Y // (sub (setne X, 0), Y) -> adc -1, Y static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); // Look through ZExts. SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) return SDValue(); SDValue SetCC = Ext.getOperand(0); if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) return SDValue(); X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); SDValue Cmp = SetCC.getOperand(1); if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || !X86::isZeroNode(Cmp.getOperand(1)) || !Cmp.getOperand(0).getValueType().isInteger()) return SDValue(); SDValue CmpOp0 = Cmp.getOperand(0); SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); if (CC == X86::COND_NE) return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, DL, OtherVal.getValueType(), OtherVal, DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, DL, OtherVal.getValueType(), OtherVal, DAG.getConstant(0, OtherVal.getValueType()), NewCmp); } /// PerformADDCombine - Do target-specific dag combines on integer adds. static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); // Try to synthesize horizontal adds from adds of shuffles. if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); return OptimizeConditionalInDecrement(N, DAG); } static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); // X86 can't encode an immediate LHS of a sub. See if we can push the // negation into a preceding instruction. if (ConstantSDNode *C = dyn_cast(Op0)) { // If the RHS of the sub is a XOR with one use and a constant, invert the // immediate. Then add one to the LHS of the sub so we can turn // X-Y -> X+~Y+1, saving one register. if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && isa(Op1.getOperand(1))) { APInt XorC = cast(Op1.getOperand(1))->getAPIntValue(); EVT VT = Op0.getValueType(); SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0), DAG.getConstant(~XorC, VT)); return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, DAG.getConstant(C->getAPIntValue()+1, VT)); } } // Try to synthesize horizontal adds from adds of shuffles. EVT VT = N->getValueType(0); if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); return OptimizeConditionalInDecrement(N, DAG); } /// performVZEXTCombine - Performs build vector combines static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc DL(N); MVT VT = N->getSimpleValueType(0); SDValue Op = N->getOperand(0); MVT OpVT = Op.getSimpleValueType(); MVT OpEltVT = OpVT.getVectorElementType(); unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); // (vzext (bitcast (vzext (x)) -> (vzext x) SDValue V = Op; while (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); if (V != Op && V.getOpcode() == X86ISD::VZEXT) { MVT InnerVT = V.getSimpleValueType(); MVT InnerEltVT = InnerVT.getVectorElementType(); // If the element sizes match exactly, we can just do one larger vzext. This // is always an exact type match as vzext operates on integer types. if (OpEltVT == InnerEltVT) { assert(OpVT == InnerVT && "Types must match for vzext!"); return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0)); } // The only other way we can combine them is if only a single element of the // inner vzext is used in the input to the outer vzext. if (InnerEltVT.getSizeInBits() < InputBits) return SDValue(); // In this case, the inner vzext is completely dead because we're going to // only look at bits inside of the low element. Just do the outer vzext on // a bitcast of the input to the inner. return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getNode(ISD::BITCAST, DL, OpVT, V)); } // Check if we can bypass extracting and re-inserting an element of an input // vector. Essentialy: // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { SDValue ExtractedV = V.getOperand(0); SDValue OrigV = ExtractedV.getOperand(0); if (auto *ExtractIdx = dyn_cast(ExtractedV.getOperand(1))) if (ExtractIdx->getZExtValue() == 0) { MVT OrigVT = OrigV.getSimpleValueType(); // Extract a subvector if necessary... if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits(); OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(), OrigVT.getVectorNumElements() / Ratio); OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, DAG.getIntPtrConstant(0)); } Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV); return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); } } return SDValue(); } SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: break; case ISD::EXTRACT_VECTOR_ELT: return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); case ISD::VSELECT: case ISD::SELECT: case X86ISD::SHRUNKBLEND: return PerformSELECTCombine(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); case ISD::MUL: return PerformMulCombine(N, DAG, DCI); case ISD::SHL: case ISD::SRA: case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget); case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget); + case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); + case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: case X86ISD::FOR: return PerformFORCombine(N, DAG); case X86ISD::FMIN: case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); case X86ISD::FAND: return PerformFANDCombine(N, DAG); case X86ISD::FANDN: return PerformFANDNCombine(N, DAG); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget); case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGNR: case X86ISD::UNPCKH: case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget); case X86ISD::INSERTPS: return PerformINSERTPSCombine(N, DAG, Subtarget); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget); } return SDValue(); } /// isTypeDesirableForOp - Return true if the target has native support for /// the specified value type and it is 'desirable' to use the type for the /// given node type. e.g. On x86 i16 is legal, but undesirable since i16 /// instruction encodings are longer and some i16 instructions are slow. bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { if (!isTypeLegal(VT)) return false; if (VT != MVT::i16) return true; switch (Opc) { default: return true; case ISD::LOAD: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::SHL: case ISD::SRL: case ISD::SUB: case ISD::ADD: case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: return false; } } /// IsDesirableToPromoteOp - This method query the target whether it is /// beneficial for dag combiner to promote the specified node. If true, it /// should return the desired promotion type by reference. bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { EVT VT = Op.getValueType(); if (VT != MVT::i16) return false; bool Promote = false; bool Commute = false; switch (Op.getOpcode()) { default: break; case ISD::LOAD: { LoadSDNode *LD = cast(Op); // If the non-extending load has a single use and it's not live out, then it // might be folded. if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& Op.hasOneUse()*/) { for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) { // The only case where we'd want to promote LOAD (rather then it being // promoted as an operand is when it's only use is liveout. if (UI->getOpcode() != ISD::CopyToReg) return false; } } Promote = true; break; } case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: Promote = true; break; case ISD::SHL: case ISD::SRL: { SDValue N0 = Op.getOperand(0); // Look out for (store (shl (load), x)). if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) return false; Promote = true; break; } case ISD::ADD: case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: Commute = true; // fallthrough case ISD::SUB: { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); if (!Commute && MayFoldLoad(N1)) return false; // Avoid disabling potential load folding opportunities. if (MayFoldLoad(N0) && (!isa(N1) || MayFoldIntoStore(Op))) return false; if (MayFoldLoad(N1) && (!isa(N0) || MayFoldIntoStore(Op))) return false; Promote = true; } } PVT = MVT::i32; return Promote; } //===----------------------------------------------------------------------===// // X86 Inline Assembly Support //===----------------------------------------------------------------------===// namespace { // Helper to match a string separated by whitespace. bool matchAsmImpl(StringRef s, ArrayRef args) { s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. for (unsigned i = 0, e = args.size(); i != e; ++i) { StringRef piece(*args[i]); if (!s.startswith(piece)) // Check if the piece matches. return false; s = s.substr(piece.size()); StringRef::size_type pos = s.find_first_not_of(" \t"); if (pos == 0) // We matched a prefix. return false; s = s.substr(pos); } return s.empty(); } const VariadicFunction1 matchAsm={}; } static bool clobbersFlagRegisters(const SmallVector &AsmPieces) { if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") && std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") && std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) { if (AsmPieces.size() == 3) return true; else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}")) return true; } } return false; } bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { InlineAsm *IA = cast(CI->getCalledValue()); std::string AsmStr = IA->getAsmString(); IntegerType *Ty = dyn_cast(CI->getType()); if (!Ty || Ty->getBitWidth() % 16 != 0) return false; // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" SmallVector AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); switch (AsmPieces.size()) { default: return false; case 1: // FIXME: this should verify that we are targeting a 486 or better. If not, // we will turn this bswap into something that will be lowered to logical // ops instead of emitting the bswap asm. For now, we don't support 486 or // lower so don't worry about this. // bswap $0 if (matchAsm(AsmPieces[0], "bswap", "$0") || matchAsm(AsmPieces[0], "bswapl", "$0") || matchAsm(AsmPieces[0], "bswapq", "$0") || matchAsm(AsmPieces[0], "bswap", "${0:q}") || matchAsm(AsmPieces[0], "bswapl", "${0:q}") || matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { // No need to check constraints, nothing other than the equivalent of // "=r,0" would be valid here. return IntrinsicLowering::LowerToByteSwap(CI); } // rorw $$8, ${0:w} --> llvm.bswap.i16 if (CI->getType()->isIntegerTy(16) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") || matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) return IntrinsicLowering::LowerToByteSwap(CI); } break; case 3: if (CI->getType()->isIntegerTy(32) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") && matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") && matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) return IntrinsicLowering::LowerToByteSwap(CI); } if (CI->getType()->isIntegerTy(64)) { InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); if (Constraints.size() >= 2 && Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 if (matchAsm(AsmPieces[0], "bswap", "%eax") && matchAsm(AsmPieces[1], "bswap", "%edx") && matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx")) return IntrinsicLowering::LowerToByteSwap(CI); } } break; } return false; } /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. X86TargetLowering::ConstraintType X86TargetLowering::getConstraintType(const std::string &Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'R': case 'q': case 'Q': case 'f': case 't': case 'u': case 'y': case 'x': case 'Y': case 'l': return C_RegisterClass; case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A': return C_Register; case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'G': case 'C': case 'e': case 'Z': return C_Other; default: break; } } return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight X86TargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); case 'R': case 'q': case 'Q': case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A': if (CallOperandVal->getType()->isIntegerTy()) weight = CW_SpecificReg; break; case 'f': case 't': case 'u': if (type->isFloatingPointTy()) weight = CW_SpecificReg; break; case 'y': if (type->isX86_MMXTy() && Subtarget->hasMMX()) weight = CW_SpecificReg; break; case 'x': case 'Y': if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256())) weight = CW_Register; break; case 'I': if (ConstantInt *C = dyn_cast(info.CallOperandVal)) { if (C->getZExtValue() <= 31) weight = CW_Constant; } break; case 'J': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 63) weight = CW_Constant; } break; case 'K': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) weight = CW_Constant; } break; case 'L': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) weight = CW_Constant; } break; case 'M': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 3) weight = CW_Constant; } break; case 'N': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 0xff) weight = CW_Constant; } break; case 'G': case 'C': if (dyn_cast(CallOperandVal)) { weight = CW_Constant; } break; case 'e': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getSExtValue() >= -0x80000000LL) && (C->getSExtValue() <= 0x7fffffffLL)) weight = CW_Constant; } break; case 'Z': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 0xffffffff) weight = CW_Constant; } break; } return weight; } /// LowerXConstraint - try to replace an X constraint, which matches anything, /// with another that has more specific requirements based on the type of the /// corresponding operand. const char *X86TargetLowering:: LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { if (Subtarget->hasSSE2()) return "Y"; if (Subtarget->hasSSE1()) return "x"; } return TargetLowering::LowerXConstraint(ConstraintVT); } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector&Ops, SelectionDAG &DAG) const { SDValue Result; // Only support length 1 constraints for now. if (Constraint.length() > 1) return; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { default: break; case 'I': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 31) { Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); break; } } return; case 'J': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 63) { Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); break; } } return; case 'K': if (ConstantSDNode *C = dyn_cast(Op)) { if (isInt<8>(C->getSExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); break; } } return; case 'L': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) { Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType()); break; } } return; case 'M': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 3) { Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); break; } } return; case 'N': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 255) { Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); break; } } return; case 'O': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 127) { Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); break; } } return; case 'e': { // 32-bit signed value if (ConstantSDNode *C = dyn_cast(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getSExtValue())) { // Widen to 64 bits here to get it sign extended. Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); break; } // FIXME gcc accepts some relocatable values here too, but only in certain // memory models; it's complicated. } return; } case 'Z': { // 32-bit unsigned value if (ConstantSDNode *C = dyn_cast(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getZExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); break; } } // FIXME gcc accepts some relocatable values here too, but only in certain // memory models; it's complicated. return; } case 'i': { // Literal immediates are always ok. if (ConstantSDNode *CST = dyn_cast(Op)) { // Widen to 64 bits here to get it sign extended. Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); break; } // In any sort of PIC mode addresses need to be computed at runtime by // adding in a register or some sort of table lookup. These can't // be used as immediates. if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) return; // If we are in non-pic codegen mode, we allow the address of a global (with // an optional displacement) to be used with 'i'. GlobalAddressSDNode *GA = nullptr; int64_t Offset = 0; // Match either (GA), (GA+C), (GA+C1+C2), etc. while (1) { if ((GA = dyn_cast(Op))) { Offset += GA->getOffset(); break; } else if (Op.getOpcode() == ISD::ADD) { if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { Offset += C->getZExtValue(); Op = Op.getOperand(0); continue; } } else if (Op.getOpcode() == ISD::SUB) { if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { Offset += -C->getZExtValue(); Op = Op.getOperand(0); continue; } } // Otherwise, this isn't something we can handle, reject it. return; } const GlobalValue *GV = GA->getGlobal(); // If we require an extra load to get this address, as in PIC mode, we // can't accept it. if (isGlobalStubReference( Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()))) return; Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), GA->getValueType(0), Offset); break; } } if (Result.getNode()) { Ops.push_back(Result); return; } return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } std::pair X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. if (Constraint.size() == 1) { // GCC Constraint Letters switch (Constraint[0]) { default: break; // TODO: Slight differences here in allocation order and leaving // RIP in the class. Do they matter any more here than they do // in the normal allocation? case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. if (Subtarget->is64Bit()) { if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i64 || VT == MVT::f64) return std::make_pair(0U, &X86::GR64RegClass); break; } // 32-bit fallthrough case 'Q': // Q_REGS if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32_ABCDRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_ABCDRegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); if (VT == MVT::i64) return std::make_pair(0U, &X86::GR64_ABCDRegClass); break; case 'r': // GENERAL_REGS case 'l': // INDEX_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) return std::make_pair(0U, &X86::GR32RegClass); return std::make_pair(0U, &X86::GR64RegClass); case 'R': // LEGACY_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_NOREXRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_NOREXRegClass); if (VT == MVT::i32 || !Subtarget->is64Bit()) return std::make_pair(0U, &X86::GR32_NOREXRegClass); return std::make_pair(0U, &X86::GR64_NOREXRegClass); case 'f': // FP Stack registers. // If SSE is enabled for this VT, use f80 to ensure the isel moves the // value to the correct fpstack register class. if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP32RegClass); if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP64RegClass); return std::make_pair(0U, &X86::RFP80RegClass); case 'y': // MMX_REGS if MMX allowed. if (!Subtarget->hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'Y': // SSE_REGS if SSE2 allowed if (!Subtarget->hasSSE2()) break; // FALL THROUGH. case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed if (!Subtarget->hasSSE1()) break; switch (VT.SimpleTy) { default: break; // Scalar SSE types. case MVT::f32: case MVT::i32: return std::make_pair(0U, &X86::FR32RegClass); case MVT::f64: case MVT::i64: return std::make_pair(0U, &X86::FR64RegClass); // Vector types. case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: return std::make_pair(0U, &X86::VR128RegClass); // AVX types. case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: return std::make_pair(0U, &X86::VR256RegClass); case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: case MVT::v8i64: return std::make_pair(0U, &X86::VR512RegClass); } break; } } // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); // Not found as a standard register? if (!Res.second) { // Map st(0) -> st(7) -> ST0 if (Constraint.size() == 7 && Constraint[0] == '{' && tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' && Constraint[3] == '(' && (Constraint[4] >= '0' && Constraint[4] <= '7') && Constraint[5] == ')' && Constraint[6] == '}') { Res.first = X86::FP0+Constraint[4]-'0'; Res.second = &X86::RFP80RegClass; return Res; } // GCC allows "st(0)" to be called just plain "st". if (StringRef("{st}").equals_lower(Constraint)) { Res.first = X86::FP0; Res.second = &X86::RFP80RegClass; return Res; } // flags -> EFLAGS if (StringRef("{flags}").equals_lower(Constraint)) { Res.first = X86::EFLAGS; Res.second = &X86::CCRRegClass; return Res; } // 'A' means EAX + EDX. if (Constraint == "A") { Res.first = X86::EAX; Res.second = &X86::GR32_ADRegClass; return Res; } return Res; } // Otherwise, check to see if this is a register class of the wrong value // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to // turn into {ax},{dx}. if (Res.second->hasType(VT)) return Res; // Correct type already, nothing to do. // All of the single-register GCC register classes map their values onto // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we // really want an 8-bit or 32-bit register, map to the appropriate register // class and return the appropriate register. if (Res.second == &X86::GR16RegClass) { if (VT == MVT::i8 || VT == MVT::i1) { unsigned DestReg = 0; switch (Res.first) { default: break; case X86::AX: DestReg = X86::AL; break; case X86::DX: DestReg = X86::DL; break; case X86::CX: DestReg = X86::CL; break; case X86::BX: DestReg = X86::BL; break; } if (DestReg) { Res.first = DestReg; Res.second = &X86::GR8RegClass; } } else if (VT == MVT::i32 || VT == MVT::f32) { unsigned DestReg = 0; switch (Res.first) { default: break; case X86::AX: DestReg = X86::EAX; break; case X86::DX: DestReg = X86::EDX; break; case X86::CX: DestReg = X86::ECX; break; case X86::BX: DestReg = X86::EBX; break; case X86::SI: DestReg = X86::ESI; break; case X86::DI: DestReg = X86::EDI; break; case X86::BP: DestReg = X86::EBP; break; case X86::SP: DestReg = X86::ESP; break; } if (DestReg) { Res.first = DestReg; Res.second = &X86::GR32RegClass; } } else if (VT == MVT::i64 || VT == MVT::f64) { unsigned DestReg = 0; switch (Res.first) { default: break; case X86::AX: DestReg = X86::RAX; break; case X86::DX: DestReg = X86::RDX; break; case X86::CX: DestReg = X86::RCX; break; case X86::BX: DestReg = X86::RBX; break; case X86::SI: DestReg = X86::RSI; break; case X86::DI: DestReg = X86::RDI; break; case X86::BP: DestReg = X86::RBP; break; case X86::SP: DestReg = X86::RSP; break; } if (DestReg) { Res.first = DestReg; Res.second = &X86::GR64RegClass; } } } else if (Res.second == &X86::FR32RegClass || Res.second == &X86::FR64RegClass || Res.second == &X86::VR128RegClass || Res.second == &X86::VR256RegClass || Res.second == &X86::FR32XRegClass || Res.second == &X86::FR64XRegClass || Res.second == &X86::VR128XRegClass || Res.second == &X86::VR256XRegClass || Res.second == &X86::VR512RegClass) { // Handle references to XMM physical registers that got mapped into the // wrong class. This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can // find, ignoring the required type. if (VT == MVT::f32 || VT == MVT::i32) Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) Res.second = &X86::FR64RegClass; else if (X86::VR128RegClass.hasType(VT)) Res.second = &X86::VR128RegClass; else if (X86::VR256RegClass.hasType(VT)) Res.second = &X86::VR256RegClass; else if (X86::VR512RegClass.hasType(VT)) Res.second = &X86::VR512RegClass; } return Res; } int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, Type *Ty) const { // Scaling factors are not free at all. // An indexed folded instruction, i.e., inst (reg1, reg2, scale), // will take 2 allocations in the out of order engine instead of 1 // for plain addressing mode, i.e. inst (reg1). // E.g., // vaddps (%rsi,%drx), %ymm0, %ymm1 // Requires two allocations (one for the load, one for the computation) // whereas: // vaddps (%rsi), %ymm0, %ymm1 // Requires just 1 allocation, i.e., freeing allocations for other operations // and having less micro operations to execute. // // For some X86 architectures, this is even worse because for instance for // stores, the complex addressing mode forces the instruction to use the // "load" ports instead of the dedicated "store" port. // E.g., on Haswell: // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. if (isLegalAddressingMode(AM, Ty)) // Scale represents reg2 * scale, thus account for 1 // as soon as we use a second register. return AM.Scale != 0; return -1; } bool X86TargetLowering::isTargetFTOL() const { return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit(); } Index: projects/clang360-import/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp =================================================================== --- projects/clang360-import/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp (revision 279025) @@ -1,599 +1,601 @@ //===- MergedLoadStoreMotion.cpp - merge and hoist/sink load/stores -------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // //! \file //! \brief This pass performs merges of loads and stores on both sides of a // diamond (hammock). It hoists the loads and sinks the stores. // // The algorithm iteratively hoists two loads to the same address out of a // diamond (hammock) and merges them into a single load in the header. Similar // it sinks and merges two stores to the tail block (footer). The algorithm // iterates over the instructions of one side of the diamond and attempts to // find a matching load/store on the other side. It hoists / sinks when it // thinks it safe to do so. This optimization helps with eg. hiding load // latencies, triggering if-conversion, and reducing static code size. // //===----------------------------------------------------------------------===// // // // Example: // Diamond shaped code before merge: // // header: // br %cond, label %if.then, label %if.else // + + // + + // + + // if.then: if.else: // %lt = load %addr_l %le = load %addr_l // // <...> <...> // store %st, %addr_s store %se, %addr_s // br label %if.end br label %if.end // + + // + + // + + // if.end ("footer"): // <...> // // Diamond shaped code after merge: // // header: // %l = load %addr_l // br %cond, label %if.then, label %if.else // + + // + + // + + // if.then: if.else: // // <...> <...> // br label %if.end br label %if.end // + + // + + // + + // if.end ("footer"): // %s.sink = phi [%st, if.then], [%se, if.else] // <...> // store %s.sink, %addr_s // <...> // // //===----------------------- TODO -----------------------------------------===// // // 1) Generalize to regions other than diamonds // 2) Be more aggressive merging memory operations // Note that both changes require register pressure control // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include using namespace llvm; #define DEBUG_TYPE "mldst-motion" //===----------------------------------------------------------------------===// // MergedLoadStoreMotion Pass //===----------------------------------------------------------------------===// namespace { class MergedLoadStoreMotion : public FunctionPass { AliasAnalysis *AA; MemoryDependenceAnalysis *MD; public: static char ID; // Pass identification, replacement for typeid explicit MergedLoadStoreMotion(void) : FunctionPass(ID), MD(nullptr), MagicCompileTimeControl(250) { initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; private: // This transformation requires dominator postdominator info void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); } // Helper routines /// /// \brief Remove instruction from parent and update memory dependence /// analysis. /// void removeInstruction(Instruction *Inst); BasicBlock *getDiamondTail(BasicBlock *BB); bool isDiamondHead(BasicBlock *BB); // Routines for hoisting loads bool isLoadHoistBarrierInRange(const Instruction& Start, const Instruction& End, LoadInst* LI); LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI); void hoistInstruction(BasicBlock *BB, Instruction *HoistCand, Instruction *ElseInst); bool isSafeToHoist(Instruction *I) const; bool hoistLoad(BasicBlock *BB, LoadInst *HoistCand, LoadInst *ElseInst); bool mergeLoads(BasicBlock *BB); // Routines for sinking stores StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI); PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1); bool isStoreSinkBarrierInRange(const Instruction& Start, const Instruction& End, AliasAnalysis::Location Loc); bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst); bool mergeStores(BasicBlock *BB); // The mergeLoad/Store algorithms could have Size0 * Size1 complexity, // where Size0 and Size1 are the #instructions on the two sides of // the diamond. The constant chosen here is arbitrary. Compiler Time // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl. const int MagicCompileTimeControl; }; char MergedLoadStoreMotion::ID = 0; } /// /// \brief createMergedLoadStoreMotionPass - The public interface to this file. /// FunctionPass *llvm::createMergedLoadStoreMotionPass() { return new MergedLoadStoreMotion(); } INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion", "MergedLoadStoreMotion", false, false) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion", "MergedLoadStoreMotion", false, false) /// /// \brief Remove instruction from parent and update memory dependence analysis. /// void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) { // Notify the memory dependence analysis. if (MD) { MD->removeInstruction(Inst); if (LoadInst *LI = dyn_cast(Inst)) MD->invalidateCachedPointerInfo(LI->getPointerOperand()); if (Inst->getType()->getScalarType()->isPointerTy()) { MD->invalidateCachedPointerInfo(Inst); } } Inst->eraseFromParent(); } /// /// \brief Return tail block of a diamond. /// BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) { assert(isDiamondHead(BB) && "Basic block is not head of a diamond"); BranchInst *BI = (BranchInst *)(BB->getTerminator()); BasicBlock *Succ0 = BI->getSuccessor(0); BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0); return Tail; } /// /// \brief True when BB is the head of a diamond (hammock) /// bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) { if (!BB) return false; if (!isa(BB->getTerminator())) return false; if (BB->getTerminator()->getNumSuccessors() != 2) return false; BranchInst *BI = (BranchInst *)(BB->getTerminator()); BasicBlock *Succ0 = BI->getSuccessor(0); BasicBlock *Succ1 = BI->getSuccessor(1); if (!Succ0->getSinglePredecessor() || Succ0->getTerminator()->getNumSuccessors() != 1) return false; if (!Succ1->getSinglePredecessor() || Succ1->getTerminator()->getNumSuccessors() != 1) return false; BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0); // Ignore triangles. if (Succ1->getTerminator()->getSuccessor(0) != Tail) return false; return true; } /// /// \brief True when instruction is a hoist barrier for a load /// /// Whenever an instruction could possibly modify the value /// being loaded or protect against the load from happening /// it is considered a hoist barrier. /// bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start, const Instruction& End, LoadInst* LI) { AliasAnalysis::Location Loc = AA->getLocation(LI); return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Mod); } /// /// \brief Decide if a load can be hoisted /// /// When there is a load in \p BB to the same address as \p LI /// and it can be hoisted from \p BB, return that load. /// Otherwise return Null. /// LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1, LoadInst *Load0) { for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE; ++BBI) { Instruction *Inst = BBI; // Only merge and hoist loads when their result in used only in BB if (!isa(Inst) || Inst->isUsedOutsideOfBlock(BB1)) continue; LoadInst *Load1 = dyn_cast(Inst); BasicBlock *BB0 = Load0->getParent(); AliasAnalysis::Location Loc0 = AA->getLocation(Load0); AliasAnalysis::Location Loc1 = AA->getLocation(Load1); if (AA->isMustAlias(Loc0, Loc1) && Load0->isSameOperationAs(Load1) && !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1) && !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0)) { return Load1; } } return nullptr; } /// /// \brief Merge two equivalent instructions \p HoistCand and \p ElseInst into /// \p BB /// /// BB is the head of a diamond /// void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB, Instruction *HoistCand, Instruction *ElseInst) { DEBUG(dbgs() << " Hoist Instruction into BB \n"; BB->dump(); dbgs() << "Instruction Left\n"; HoistCand->dump(); dbgs() << "\n"; dbgs() << "Instruction Right\n"; ElseInst->dump(); dbgs() << "\n"); // Hoist the instruction. assert(HoistCand->getParent() != BB); // Intersect optional metadata. HoistCand->intersectOptionalDataWith(ElseInst); HoistCand->dropUnknownMetadata(); // Prepend point for instruction insert Instruction *HoistPt = BB->getTerminator(); // Merged instruction Instruction *HoistedInst = HoistCand->clone(); // Notify AA of the new value. if (isa(HoistCand)) AA->copyValue(HoistCand, HoistedInst); // Hoist instruction. HoistedInst->insertBefore(HoistPt); HoistCand->replaceAllUsesWith(HoistedInst); removeInstruction(HoistCand); // Replace the else block instruction. ElseInst->replaceAllUsesWith(HoistedInst); removeInstruction(ElseInst); } /// /// \brief Return true if no operand of \p I is defined in I's parent block /// bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const { BasicBlock *Parent = I->getParent(); for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { Instruction *Instr = dyn_cast(I->getOperand(i)); if (Instr && Instr->getParent() == Parent) return false; } return true; } /// /// \brief Merge two equivalent loads and GEPs and hoist into diamond head /// bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0, LoadInst *L1) { // Only one definition? Instruction *A0 = dyn_cast(L0->getPointerOperand()); Instruction *A1 = dyn_cast(L1->getPointerOperand()); if (A0 && A1 && A0->isIdenticalTo(A1) && isSafeToHoist(A0) && A0->hasOneUse() && (A0->getParent() == L0->getParent()) && A1->hasOneUse() && (A1->getParent() == L1->getParent()) && isa(A0)) { DEBUG(dbgs() << "Hoist Instruction into BB \n"; BB->dump(); dbgs() << "Instruction Left\n"; L0->dump(); dbgs() << "\n"; dbgs() << "Instruction Right\n"; L1->dump(); dbgs() << "\n"); hoistInstruction(BB, A0, A1); hoistInstruction(BB, L0, L1); return true; } else return false; } /// /// \brief Try to hoist two loads to same address into diamond header /// /// Starting from a diamond head block, iterate over the instructions in one /// successor block and try to match a load in the second successor. /// bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { bool MergedLoads = false; assert(isDiamondHead(BB)); BranchInst *BI = dyn_cast(BB->getTerminator()); BasicBlock *Succ0 = BI->getSuccessor(0); BasicBlock *Succ1 = BI->getSuccessor(1); // #Instructions in Succ1 for Compile Time Control int Size1 = Succ1->size(); int NLoads = 0; for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end(); BBI != BBE;) { Instruction *I = BBI; ++BBI; // Only move non-simple (atomic, volatile) loads. LoadInst *L0 = dyn_cast(I); if (!L0 || !L0->isSimple() || L0->isUsedOutsideOfBlock(Succ0)) continue; ++NLoads; if (NLoads * Size1 >= MagicCompileTimeControl) break; if (LoadInst *L1 = canHoistFromBlock(Succ1, L0)) { bool Res = hoistLoad(BB, L0, L1); MergedLoads |= Res; // Don't attempt to hoist above loads that had not been hoisted. if (!Res) break; } } return MergedLoads; } /// /// \brief True when instruction is a sink barrier for a store /// located in Loc /// /// Whenever an instruction could possibly read or modify the /// value being stored or protect against the store from /// happening it is considered a sink barrier. /// bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction& Start, const Instruction& End, AliasAnalysis::Location Loc) { - return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Ref); + return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::ModRef); } /// /// \brief Check if \p BB contains a store to the same address as \p SI /// /// \return The store in \p when it is safe to sink. Otherwise return Null. /// StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1, StoreInst *Store0) { DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n"); + BasicBlock *BB0 = Store0->getParent(); for (BasicBlock::reverse_iterator RBI = BB1->rbegin(), RBE = BB1->rend(); RBI != RBE; ++RBI) { Instruction *Inst = &*RBI; if (!isa(Inst)) continue; StoreInst *Store1 = cast(Inst); - BasicBlock *BB0 = Store0->getParent(); AliasAnalysis::Location Loc0 = AA->getLocation(Store0); AliasAnalysis::Location Loc1 = AA->getLocation(Store1); if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) && - !isStoreSinkBarrierInRange(*Store1, BB1->back(), Loc1) && - !isStoreSinkBarrierInRange(*Store0, BB0->back(), Loc0)) { + !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store1))), + BB1->back(), Loc1) && + !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store0))), + BB0->back(), Loc0)) { return Store1; } } return nullptr; } /// /// \brief Create a PHI node in BB for the operands of S0 and S1 /// PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1) { // Create a phi if the values mismatch. PHINode *NewPN = 0; Value *Opd1 = S0->getValueOperand(); Value *Opd2 = S1->getValueOperand(); if (Opd1 != Opd2) { NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink", BB->begin()); NewPN->addIncoming(Opd1, S0->getParent()); NewPN->addIncoming(Opd2, S1->getParent()); if (NewPN->getType()->getScalarType()->isPointerTy()) { // Notify AA of the new value. AA->copyValue(Opd1, NewPN); AA->copyValue(Opd2, NewPN); // AA needs to be informed when a PHI-use of the pointer value is added for (unsigned I = 0, E = NewPN->getNumIncomingValues(); I != E; ++I) { unsigned J = PHINode::getOperandNumForIncomingValue(I); AA->addEscapingUse(NewPN->getOperandUse(J)); } if (MD) MD->invalidateCachedPointerInfo(NewPN); } } return NewPN; } /// /// \brief Merge two stores to same address and sink into \p BB /// /// Also sinks GEP instruction computing the store address /// bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, StoreInst *S1) { // Only one definition? Instruction *A0 = dyn_cast(S0->getPointerOperand()); Instruction *A1 = dyn_cast(S1->getPointerOperand()); if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() && (A0->getParent() == S0->getParent()) && A1->hasOneUse() && (A1->getParent() == S1->getParent()) && isa(A0)) { DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump(); dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n"; dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n"); // Hoist the instruction. BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); // Intersect optional metadata. S0->intersectOptionalDataWith(S1); S0->dropUnknownMetadata(); // Create the new store to be inserted at the join point. StoreInst *SNew = (StoreInst *)(S0->clone()); Instruction *ANew = A0->clone(); AA->copyValue(S0, SNew); SNew->insertBefore(InsertPt); ANew->insertBefore(SNew); assert(S0->getParent() == A0->getParent()); assert(S1->getParent() == A1->getParent()); PHINode *NewPN = getPHIOperand(BB, S0, S1); // New PHI operand? Use it. if (NewPN) SNew->setOperand(0, NewPN); removeInstruction(S0); removeInstruction(S1); A0->replaceAllUsesWith(ANew); removeInstruction(A0); A1->replaceAllUsesWith(ANew); removeInstruction(A1); return true; } return false; } /// /// \brief True when two stores are equivalent and can sink into the footer /// /// Starting from a diamond tail block, iterate over the instructions in one /// predecessor block and try to match a store in the second predecessor. /// bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { bool MergedStores = false; assert(T && "Footer of a diamond cannot be empty"); pred_iterator PI = pred_begin(T), E = pred_end(T); assert(PI != E); BasicBlock *Pred0 = *PI; ++PI; BasicBlock *Pred1 = *PI; ++PI; // tail block of a diamond/hammock? if (Pred0 == Pred1) return false; // No. if (PI != E) return false; // No. More than 2 predecessors. // #Instructions in Succ1 for Compile Time Control int Size1 = Pred1->size(); int NStores = 0; for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend(); RBI != RBE;) { Instruction *I = &*RBI; ++RBI; // Sink move non-simple (atomic, volatile) stores if (!isa(I)) continue; StoreInst *S0 = (StoreInst *)I; if (!S0->isSimple()) continue; ++NStores; if (NStores * Size1 >= MagicCompileTimeControl) break; if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) { bool Res = sinkStore(T, S0, S1); MergedStores |= Res; // Don't attempt to sink below stores that had to stick around // But after removal of a store and some of its feeding // instruction search again from the beginning since the iterator // is likely stale at this point. if (!Res) break; else { RBI = Pred0->rbegin(); RBE = Pred0->rend(); DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump()); } } } return MergedStores; } /// /// \brief Run the transformation for each function /// bool MergedLoadStoreMotion::runOnFunction(Function &F) { MD = &getAnalysis(); AA = &getAnalysis(); bool Changed = false; DEBUG(dbgs() << "Instruction Merger\n"); // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) { BasicBlock *BB = FI++; // Hoist equivalent loads and sink stores // outside diamonds when possible if (isDiamondHead(BB)) { Changed |= mergeLoads(BB); Changed |= mergeStores(getDiamondTail(BB)); } } return Changed; } Index: projects/clang360-import/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp =================================================================== --- projects/clang360-import/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp (revision 279025) @@ -1,398 +1,414 @@ //===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements some loop unrolling utilities for loops with run-time // trip counts. See LoopUnroll.cpp for unrolling loops with compile-time // trip counts. // // The functions in this file are used to generate extra code when the // run-time trip count modulo the unroll factor is not 0. When this is the // case, we need to generate code to execute these 'left over' iterations. // // The current strategy generates an if-then-else sequence prior to the // unrolled loop to execute the 'left over' iterations. Other strategies // include generate a loop before or after the unrolled loop. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/UnrollLoop.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include using namespace llvm; #define DEBUG_TYPE "loop-unroll" STATISTIC(NumRuntimeUnrolled, "Number of loops unrolled with run-time trip counts"); /// Connect the unrolling prolog code to the original loop. /// The unrolling prolog code contains code to execute the /// 'extra' iterations if the run-time trip count modulo the /// unroll count is non-zero. /// /// This function performs the following: /// - Create PHI nodes at prolog end block to combine values /// that exit the prolog code and jump around the prolog. /// - Add a PHI operand to a PHI node at the loop exit block /// for values that exit the prolog and go around the loop. /// - Branch around the original loop if the trip count is less /// than the unroll factor. /// -static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, +static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, BasicBlock *LastPrologBB, BasicBlock *PrologEnd, BasicBlock *OrigPH, BasicBlock *NewPH, ValueToValueMapTy &VMap, Pass *P) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); // Create a PHI node for each outgoing value from the original loop // (which means it is an outgoing value from the prolog code too). // The new PHI node is inserted in the prolog end basic block. // The new PHI name is added as an operand of a PHI node in either // the loop header or the loop exit block. for (succ_iterator SBI = succ_begin(Latch), SBE = succ_end(Latch); SBI != SBE; ++SBI) { for (BasicBlock::iterator BBI = (*SBI)->begin(); PHINode *PN = dyn_cast(BBI); ++BBI) { // Add a new PHI node to the prolog end block and add the // appropriate incoming values. PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName()+".unr", PrologEnd->getTerminator()); // Adding a value to the new PHI node from the original loop preheader. // This is the value that skips all the prolog code. if (L->contains(PN)) { NewPN->addIncoming(PN->getIncomingValueForBlock(NewPH), OrigPH); } else { NewPN->addIncoming(Constant::getNullValue(PN->getType()), OrigPH); } Value *V = PN->getIncomingValueForBlock(Latch); if (Instruction *I = dyn_cast(V)) { if (L->contains(I)) { V = VMap[I]; } } // Adding a value to the new PHI node from the last prolog block // that was created. NewPN->addIncoming(V, LastPrologBB); // Update the existing PHI node operand with the value from the // new PHI node. How this is done depends on if the existing // PHI node is in the original loop block, or the exit block. if (L->contains(PN)) { PN->setIncomingValue(PN->getBasicBlockIndex(NewPH), NewPN); } else { PN->addIncoming(NewPN, PrologEnd); } } } - // Create a branch around the orignal loop, which is taken if the - // trip count is less than the unroll factor. + // Create a branch around the orignal loop, which is taken if there are no + // iterations remaining to be executed after running the prologue. Instruction *InsertPt = PrologEnd->getTerminator(); + + assert(Count != 0 && "nonsensical Count!"); + + // If BECount getType(), Count)); + new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, BECount, + ConstantInt::get(BECount->getType(), Count - 1)); BasicBlock *Exit = L->getUniqueExitBlock(); assert(Exit && "Loop must have a single exit block only"); // Split the exit to maintain loop canonicalization guarantees SmallVector Preds(pred_begin(Exit), pred_end(Exit)); if (!Exit->isLandingPad()) { SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", P); } else { SmallVector NewBBs; SplitLandingPadPredecessors(Exit, Preds, ".unr1-lcssa", ".unr2-lcssa", P, NewBBs); } // Add the branch to the exit block (around the unrolled loop) BranchInst::Create(Exit, NewPH, BrLoopExit, InsertPt); InsertPt->eraseFromParent(); } /// Create a clone of the blocks in a loop and connect them together. /// If UnrollProlog is true, loop structure will not be cloned, otherwise a new /// loop will be created including all cloned blocks, and the iterator of it /// switches to count NewIter down to 0. /// static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, BasicBlock *InsertTop, BasicBlock *InsertBot, std::vector &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, LoopInfo *LI) { BasicBlock *Preheader = L->getLoopPreheader(); BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); Function *F = Header->getParent(); LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO(); LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO(); Loop *NewLoop = 0; Loop *ParentLoop = L->getParentLoop(); if (!UnrollProlog) { NewLoop = new Loop(); if (ParentLoop) ParentLoop->addChildLoop(NewLoop); else LI->addTopLevelLoop(NewLoop); } // For each block in the original loop, create a new copy, // and update the value map with the newly created values. for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".prol", F); NewBlocks.push_back(NewBB); if (NewLoop) NewLoop->addBasicBlockToLoop(NewBB, LI->getBase()); else if (ParentLoop) ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); VMap[*BB] = NewBB; if (Header == *BB) { // For the first block, add a CFG connection to this newly // created block. InsertTop->getTerminator()->setSuccessor(0, NewBB); } if (Latch == *BB) { // For the last block, if UnrollProlog is true, create a direct jump to // InsertBot. If not, create a loop back to cloned head. VMap.erase((*BB)->getTerminator()); BasicBlock *FirstLoopBB = cast(VMap[Header]); BranchInst *LatchBR = cast(NewBB->getTerminator()); if (UnrollProlog) { LatchBR->eraseFromParent(); BranchInst::Create(InsertBot, NewBB); } else { PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, "prol.iter", FirstLoopBB->getFirstNonPHI()); IRBuilder<> Builder(LatchBR); Value *IdxSub = Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), NewIdx->getName() + ".sub"); Value *IdxCmp = Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp"); BranchInst::Create(FirstLoopBB, InsertBot, IdxCmp, NewBB); NewIdx->addIncoming(NewIter, InsertTop); NewIdx->addIncoming(IdxSub, NewBB); LatchBR->eraseFromParent(); } } } // Change the incoming values to the ones defined in the preheader or // cloned loop. for (BasicBlock::iterator I = Header->begin(); isa(I); ++I) { PHINode *NewPHI = cast(VMap[I]); if (UnrollProlog) { VMap[I] = NewPHI->getIncomingValueForBlock(Preheader); cast(VMap[Header])->getInstList().erase(NewPHI); } else { unsigned idx = NewPHI->getBasicBlockIndex(Preheader); NewPHI->setIncomingBlock(idx, InsertTop); BasicBlock *NewLatch = cast(VMap[Latch]); idx = NewPHI->getBasicBlockIndex(Latch); Value *InVal = NewPHI->getIncomingValue(idx); NewPHI->setIncomingBlock(idx, NewLatch); if (VMap[InVal]) NewPHI->setIncomingValue(idx, VMap[InVal]); } } if (NewLoop) { // Add unroll disable metadata to disable future unrolling for this loop. SmallVector MDs; // Reserve first location for self reference to the LoopID metadata node. MDs.push_back(nullptr); MDNode *LoopID = NewLoop->getLoopID(); if (LoopID) { // First remove any existing loop unrolling metadata. for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { bool IsUnrollMetadata = false; MDNode *MD = dyn_cast(LoopID->getOperand(i)); if (MD) { const MDString *S = dyn_cast(MD->getOperand(0)); IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); } if (!IsUnrollMetadata) MDs.push_back(LoopID->getOperand(i)); } } LLVMContext &Context = NewLoop->getHeader()->getContext(); SmallVector DisableOperands; DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); MDNode *DisableNode = MDNode::get(Context, DisableOperands); MDs.push_back(DisableNode); MDNode *NewLoopID = MDNode::get(Context, MDs); // Set operand 0 to refer to the loop id itself. NewLoopID->replaceOperandWith(0, NewLoopID); NewLoop->setLoopID(NewLoopID); } } /// Insert code in the prolog code when unrolling a loop with a /// run-time trip-count. /// /// This method assumes that the loop unroll factor is total number /// of loop bodes in the loop after unrolling. (Some folks refer /// to the unroll factor as the number of *extra* copies added). /// We assume also that the loop unroll factor is a power-of-two. So, after /// unrolling the loop, the number of loop bodies executed is 2, /// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for /// the switch instruction is generated. /// /// extraiters = tripcount % loopfactor /// if (extraiters == 0) jump Loop: /// else jump Prol /// Prol: LoopBody; /// extraiters -= 1 // Omitted if unroll factor is 2. /// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2. /// if (tripcount < loopfactor) jump End /// Loop: /// ... /// End: /// bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, LPPassManager *LPM) { // for now, only unroll loops that contain a single exit if (!L->getExitingBlock()) return false; // Make sure the loop is in canonical form, and there is a single // exit block only. if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock()) return false; // Use Scalar Evolution to compute the trip count. This allows more // loops to be unrolled than relying on induction var simplification if (!LPM) return false; ScalarEvolution *SE = LPM->getAnalysisIfAvailable(); if (!SE) return false; // Only unroll loops with a computable trip count and the trip count needs // to be an int value (allowing a pointer type is a TODO item) - const SCEV *BECount = SE->getBackedgeTakenCount(L); - if (isa(BECount) || !BECount->getType()->isIntegerTy()) + const SCEV *BECountSC = SE->getBackedgeTakenCount(L); + if (isa(BECountSC) || + !BECountSC->getType()->isIntegerTy()) return false; - // If BECount is INT_MAX, we can't compute trip-count without overflow. - if (BECount->isAllOnesValue()) - return false; + unsigned BEWidth = cast(BECountSC->getType())->getBitWidth(); // Add 1 since the backedge count doesn't include the first loop iteration const SCEV *TripCountSC = - SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1)); + SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); if (isa(TripCountSC)) return false; // We only handle cases when the unroll factor is a power of 2. // Count is the loop unroll factor, the number of extra copies added + 1. - if ((Count & (Count-1)) != 0) + if (!isPowerOf2_32(Count)) return false; + // This constraint lets us deal with an overflowing trip count easily; see the + // comment on ModVal below. This check is equivalent to `Log2(Count) < + // BEWidth`. + if (static_cast(Count) > (1ULL << BEWidth)) + return false; + // If this loop is nested, then the loop unroller changes the code in // parent loop, so the Scalar Evolution pass needs to be run again if (Loop *ParentLoop = L->getParentLoop()) SE->forgetLoop(ParentLoop); BasicBlock *PH = L->getLoopPreheader(); BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); // It helps to splits the original preheader twice, one for the end of the // prolog code and one for a new loop preheader BasicBlock *PEnd = SplitEdge(PH, Header, LPM->getAsPass()); BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), LPM->getAsPass()); BranchInst *PreHeaderBR = cast(PH->getTerminator()); // Compute the number of extra iterations required, which is: // extra iterations = run-time trip count % (loop unroll factor + 1) SCEVExpander Expander(*SE, "loop-unroll"); Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), PreHeaderBR); + Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(), + PreHeaderBR); IRBuilder<> B(PreHeaderBR); Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); - // Check if for no extra iterations, then jump to cloned/unrolled loop. - // We have to check that the trip count computation didn't overflow when - // adding one to the backedge taken count. - Value *LCmp = B.CreateIsNotNull(ModVal, "lcmp.mod"); - Value *OverflowCheck = B.CreateIsNull(TripCount, "lcmp.overflow"); - Value *BranchVal = B.CreateOr(OverflowCheck, LCmp, "lcmp.or"); + // If ModVal is zero, we know that either + // 1. there are no iteration to be run in the prologue loop + // OR + // 2. the addition computing TripCount overflowed + // + // If (2) is true, we know that TripCount really is (1 << BEWidth) and so the + // number of iterations that remain to be run in the original loop is a + // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we + // explicitly check this above). + Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod"); + // Branch to either the extra iterations or the cloned/unrolled loop // We will fix up the true branch label when adding loop body copies BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR); assert(PreHeaderBR->isUnconditional() && PreHeaderBR->getSuccessor(0) == PEnd && "CFG edges in Preheader are not correct"); PreHeaderBR->eraseFromParent(); Function *F = Header->getParent(); // Get an ordered list of blocks in the loop to help with the ordering of the // cloned blocks in the prolog code LoopBlocksDFS LoopBlocks(L); LoopBlocks.perform(LI); // // For each extra loop iteration, create a copy of the loop's basic blocks // and generate a condition that branches to the copy depending on the // number of 'left over' iterations. // std::vector NewBlocks; ValueToValueMapTy VMap; - // If unroll count is 2 and we can't overflow in tripcount computation (which - // is BECount + 1), then we don't need a loop for prologue, and we can unroll - // it. We can be sure that we don't overflow only if tripcount is a constant. - bool UnrollPrologue = (Count == 2 && isa(TripCount)); + bool UnrollPrologue = Count == 2; // Clone all the basic blocks in the loop. If Count is 2, we don't clone // the loop, otherwise we create a cloned loop to execute the extra // iterations. This function adds the appropriate CFG connections. CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks, VMap, LI); // Insert the cloned blocks into function just before the original loop F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0], F->end()); // Rewrite the cloned instruction operands to use the values // created when the clone is created. for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) { for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) { RemapInstruction(I, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); } } // Connect the prolog code to the original loop and update the // PHI functions. BasicBlock *LastLoopBB = cast(VMap[Latch]); - ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, + ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, LPM->getAsPass()); NumRuntimeUnrolled++; return true; } Index: projects/clang360-import/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- projects/clang360-import/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (revision 279025) @@ -1,6318 +1,6320 @@ //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops // and generates target-independent LLVM-IR. // The vectorizer uses the TargetTransformInfo analysis to estimate the costs // of instructions in order to estimate the profitability of vectorization. // // The loop vectorizer combines consecutive loop iterations into a single // 'wide' iteration. After this transformation the index is incremented // by the SIMD vector width, and not by one. // // This pass has three parts: // 1. The main loop pass that drives the different parts. // 2. LoopVectorizationLegality - A unit that checks for the legality // of the vectorization. // 3. InnerLoopVectorizer - A unit that performs the actual // widening of instructions. // 4. LoopVectorizationCostModel - A unit that checks for the profitability // of vectorization. It decides on the optimal vector width, which // can be one, if vectorization is not profitable. // //===----------------------------------------------------------------------===// // // The reduction-variable vectorization is based on the paper: // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. // // Variable uniformity checks are inspired by: // Karrenberg, R. and Hack, S. Whole Function Vectorization. // // Other ideas/concepts are from: // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. // // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of // Vectorizing Compilers. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" #include "llvm/Pass.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/VectorUtils.h" #include #include #include using namespace llvm; using namespace llvm::PatternMatch; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); static cl::opt VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect.")); static cl::opt VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden, cl::desc("Sets the vectorization interleave count. " "Zero is autoselect.")); static cl::opt EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); /// We don't vectorize loops with a known constant trip count below this number. static cl::opt TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Don't vectorize loops with a constant " "trip count that is smaller than this " "value.")); /// This enables versioning on the strides of symbolically striding memory /// accesses in code like the following. /// for (i = 0; i < N; ++i) /// A[i * Stride1] += B[i * Stride2] ... /// /// Will be roughly translated to /// if (Stride1 == 1 && Stride2 == 1) { /// for (i = 0; i < N; i+=4) /// A[i:i+3] += ... /// } else /// ... static cl::opt EnableMemAccessVersioning( "enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symblic stride memory access versioning")); /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; /// When performing memory disambiguation checks at runtime do not make more /// than this number of comparisons. static const unsigned RuntimeMemoryCheckThreshold = 8; /// Maximum simd width. static const unsigned MaxVectorWidth = 64; static cl::opt ForceTargetNumScalarRegs( "force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers.")); static cl::opt ForceTargetNumVectorRegs( "force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers.")); /// Maximum vectorization interleave count. static const unsigned MaxInterleaveFactor = 16; static cl::opt ForceTargetMaxScalarInterleaveFactor( "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops.")); static cl::opt ForceTargetMaxVectorInterleaveFactor( "force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops.")); static cl::opt ForceTargetInstructionCost( "force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing.")); static cl::opt SmallLoopCost( "small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the unroller.")); static cl::opt LoopVectorizeWithBlockFrequency( "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions.")); // Runtime unroll loops for load/store throughput. static cl::opt EnableLoadStoreRuntimeUnroll( "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden, cl::desc("Enable runtime unrolling until load/store ports are saturated")); /// The number of stores in a loop that are allowed to need predication. static cl::opt NumberOfStoresToPredicate( "vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if.")); static cl::opt EnableIndVarRegisterHeur( "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when unrolling")); static cl::opt EnableCondStoresVectorization( "enable-cond-stores-vec", cl::init(false), cl::Hidden, cl::desc("Enable if predication of stores during vectorization.")); static cl::opt MaxNestedScalarReductionUF( "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden, cl::desc("The maximum unroll factor to use when unrolling a scalar " "reduction in a nested loop.")); namespace { // Forward declarations. class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizeHints; /// Optimization analysis message produced during vectorization. Messages inform /// the user why vectorization did not occur. class Report { std::string Message; raw_string_ostream Out; Instruction *Instr; public: Report(Instruction *I = nullptr) : Out(Message), Instr(I) { Out << "loop not vectorized: "; } template Report &operator<<(const A &Value) { Out << Value; return *this; } Instruction *getInstr() { return Instr; } std::string &str() { return Out.str(); } operator Twine() { return Out.str(); } }; /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple /// scalars. This class also implements the following features: /// * It inserts an epilogue loop for handling loops that don't have iteration /// counts that are known to be a multiple of the vectorization factor. /// * It handles the code generation for reduction variables. /// * Scalarization (implementation using scalars) of un-vectorizable /// instructions. /// InnerLoopVectorizer does not perform any vectorization-legality /// checks, and relies on the caller to check for the different legality /// aspects. The InnerLoopVectorizer relies on the /// LoopVectorizationLegality class to provide information about the induction /// and reduction variables that were found to a given vectorization factor. class InnerLoopVectorizer { public: InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, const DataLayout *DL, const TargetLibraryInfo *TLI, unsigned VecWidth, unsigned UnrollFactor) : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI), VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), Legal(nullptr) {} // Perform the actual loop widening (vectorization). void vectorize(LoopVectorizationLegality *L) { Legal = L; // Create a new empty loop. Unlink the old loop and connect the new one. createEmptyLoop(); // Widen each instruction in the old loop to a new one in the new loop. // Use the Legality module to find the induction and reduction variables. vectorizeLoop(); // Register the new loop and update the analysis passes. updateAnalysis(); } virtual ~InnerLoopVectorizer() {} protected: /// A small list of PHINodes. typedef SmallVector PhiVector; /// When we unroll loops we have multiple vector values for each scalar. /// This data structure holds the unrolled and vectorized values that /// originated from one scalar instruction. typedef SmallVector VectorParts; // When we if-convert we need create edge masks. We have to cache values so // that we don't end up with exponential recursion/IR. typedef DenseMap, VectorParts> EdgeMaskCache; /// \brief Add code that checks at runtime if the accessed arrays overlap. /// /// Returns a pair of instructions where the first element is the first /// instruction generated in possibly a sequence of instructions and the /// second value is the final comparator value or NULL if no check is needed. std::pair addRuntimeCheck(Instruction *Loc); /// \brief Add checks for strides that where assumed to be 1. /// /// Returns the last check instruction and the first check instruction in the /// pair as (first, last). std::pair addStrideCheck(Instruction *Loc); /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(); /// Copy and widen the instructions from the old loop. virtual void vectorizeLoop(); /// \brief The Loop exit block may have single value PHI nodes where the /// incoming value is 'Undef'. While vectorizing we only handled real values /// that were defined inside the loop. Here we fix the 'undef case'. /// See PR14725. void fixLCSSAPHIs(); /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* /// mask for the block BB. VectorParts createBlockInMask(BasicBlock *BB); /// A helper function that computes the predicate of the edge between SRC /// and DST. VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); /// A helper function to vectorize a single BB within the innermost loop. void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. void widenPHIInstruction(Instruction *PN, VectorParts &Entry, unsigned UF, unsigned VF, PhiVector *PV); /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. void updateAnalysis(); /// This instruction is un-vectorizable. Implement it as a sequence /// of scalars. If \p IfPredicateStore is true we need to 'hide' each /// scalarized instruction behind an if block predicated on the control /// dependence of the instruction. virtual void scalarizeInstruction(Instruction *Instr, bool IfPredicateStore=false); /// Vectorize Load and Store instructions, virtual void vectorizeMemoryInstruction(Instruction *Instr); /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction /// value. If this is the induction variable then we extend it to N, N+1, ... /// this is needed because each iteration in the loop corresponds to a SIMD /// element. virtual Value *getBroadcastInstrs(Value *V); /// This function adds 0, 1, 2 ... to each vector element, starting at zero. /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). /// The sequence starts at StartIndex. virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. /// When we widen (vectorize) values we place them in the map. If the values /// are not within the map, they have to be loop invariant, so we simply /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); /// This is a helper class that holds the vectorizer state. It maps scalar /// instructions to vector instructions. When the code is 'unrolled' then /// then a single scalar value is mapped to multiple vector parts. The parts /// are stored in the VectorPart type. struct ValueMap { /// C'tor. UnrollFactor controls the number of vectors ('parts') that /// are mapped. ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} /// \return True if 'Key' is saved in the Value Map. bool has(Value *Key) const { return MapStorage.count(Key); } /// Initializes a new entry in the map. Sets all of the vector parts to the /// save value in 'Val'. /// \return A reference to a vector with splat values. VectorParts &splat(Value *Key, Value *Val) { VectorParts &Entry = MapStorage[Key]; Entry.assign(UF, Val); return Entry; } ///\return A reference to the value that is stored at 'Key'. VectorParts &get(Value *Key) { VectorParts &Entry = MapStorage[Key]; if (Entry.empty()) Entry.resize(UF); assert(Entry.size() == UF); return Entry; } private: /// The unroll factor. Each entry in the map stores this number of vector /// elements. unsigned UF; /// Map storage. We use std::map and not DenseMap because insertions to a /// dense map invalidates its iterators. std::map MapStorage; }; /// The original loop. Loop *OrigLoop; /// Scev analysis to use. ScalarEvolution *SE; /// Loop Info. LoopInfo *LI; /// Dominator Tree. DominatorTree *DT; /// Alias Analysis. AliasAnalysis *AA; /// Data Layout. const DataLayout *DL; /// Target Library Info. const TargetLibraryInfo *TLI; /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. unsigned VF; protected: /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; /// The builder that we use IRBuilder<> Builder; // --- Vectorization state --- /// The vector-loop preheader. BasicBlock *LoopVectorPreHeader; /// The scalar-loop preheader. BasicBlock *LoopScalarPreHeader; /// Middle Block between the vector and the scalar. BasicBlock *LoopMiddleBlock; ///The ExitBlock of the scalar loop. BasicBlock *LoopExitBlock; ///The vector loop body. SmallVector LoopVectorBody; ///The scalar loop body. BasicBlock *LoopScalarBody; /// A list of all bypass blocks. The first block is the entry of the loop. SmallVector LoopBypassBlocks; /// The new Induction variable which was added to the new block. PHINode *Induction; /// The induction variable of the old basic block. PHINode *OldInduction; /// Holds the extended (to the widest induction type) start index. Value *ExtendedIdx; /// Maps scalars to widened vectors. ValueMap WidenMap; EdgeMaskCache MaskCache; LoopVectorizationLegality *Legal; }; class InnerLoopUnroller : public InnerLoopVectorizer { public: InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, const DataLayout *DL, const TargetLibraryInfo *TLI, unsigned UnrollFactor) : InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { } private: void scalarizeInstruction(Instruction *Instr, bool IfPredicateStore = false) override; void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override; Value *reverseVector(Value *Vec) override; }; /// \brief Look for a meaningful debug location on the instruction or it's /// operands. static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { if (!I) return I; DebugLoc Empty; if (I->getDebugLoc() != Empty) return I; for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { if (Instruction *OpInst = dyn_cast(*OI)) if (OpInst->getDebugLoc() != Empty) return OpInst; } return I; } /// \brief Set the debug location in the builder using the debug location in the /// instruction. static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { if (const Instruction *Inst = dyn_cast_or_null(Ptr)) B.SetCurrentDebugLocation(Inst->getDebugLoc()); else B.SetCurrentDebugLocation(DebugLoc()); } #ifndef NDEBUG /// \return string containing a file name and a line # for the given loop. static std::string getDebugLocString(const Loop *L) { std::string Result; if (L) { raw_string_ostream OS(Result); const DebugLoc LoopDbgLoc = L->getStartLoc(); if (!LoopDbgLoc.isUnknown()) LoopDbgLoc.print(L->getHeader()->getContext(), OS); else // Just print the module name. OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); OS.flush(); } return Result; } #endif /// \brief Propagate known metadata from one instruction to another. static void propagateMetadata(Instruction *To, const Instruction *From) { SmallVector, 4> Metadata; From->getAllMetadataOtherThanDebugLoc(Metadata); for (auto M : Metadata) { unsigned Kind = M.first; // These are safe to transfer (this is safe for TBAA, even when we // if-convert, because should that metadata have had a control dependency // on the condition, and thus actually aliased with some other // non-speculated memory access when the condition was false, this would be // caught by the runtime overlap checks). if (Kind != LLVMContext::MD_tbaa && Kind != LLVMContext::MD_alias_scope && Kind != LLVMContext::MD_noalias && Kind != LLVMContext::MD_fpmath) continue; To->setMetadata(Kind, M.second); } } /// \brief Propagate known metadata from one instruction to a vector of others. static void propagateMetadata(SmallVectorImpl &To, const Instruction *From) { for (Value *V : To) if (Instruction *I = dyn_cast(V)) propagateMetadata(I, From); } /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the /// legality. This class has two main kinds of checks: /// * Memory checks - The code in canVectorizeMemory checks if vectorization /// will change the order of memory accesses in a way that will change the /// correctness of the program. /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory /// checks for a number of different conditions, such as the availability of a /// single induction variable, that all types are supported and vectorize-able, /// etc. This code reflects the capabilities of InnerLoopVectorizer. /// This class is also used by InnerLoopVectorizer for identifying /// induction variable and the different reduction variables. class LoopVectorizationLegality { public: unsigned NumLoads; unsigned NumStores; unsigned NumPredStores; LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL, DominatorTree *DT, TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F, const TargetTransformInfo *TTI) : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI), AA(AA), TheFunction(F), TTI(TTI), Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) { } /// This enum represents the kinds of reductions that we support. enum ReductionKind { RK_NoReduction, ///< Not a reduction. RK_IntegerAdd, ///< Sum of integers. RK_IntegerMult, ///< Product of integers. RK_IntegerOr, ///< Bitwise or logical OR of numbers. RK_IntegerAnd, ///< Bitwise or logical AND of numbers. RK_IntegerXor, ///< Bitwise or logical XOR of numbers. RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). RK_FloatAdd, ///< Sum of floats. RK_FloatMult, ///< Product of floats. RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). }; /// This enum represents the kinds of inductions that we support. enum InductionKind { IK_NoInduction, ///< Not an induction variable. IK_IntInduction, ///< Integer induction variable. Step = 1. IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem). IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). }; // This enum represents the kind of minmax reduction. enum MinMaxReductionKind { MRK_Invalid, MRK_UIntMin, MRK_UIntMax, MRK_SIntMin, MRK_SIntMax, MRK_FloatMin, MRK_FloatMax }; /// This struct holds information about reduction variables. struct ReductionDescriptor { ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr), Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K, MinMaxReductionKind MK) : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {} // The starting value of the reduction. // It does not have to be zero! TrackingVH StartValue; // The instruction who's value is used outside the loop. Instruction *LoopExitInstr; // The kind of the reduction. ReductionKind Kind; // If this a min/max reduction the kind of reduction. MinMaxReductionKind MinMaxKind; }; /// This POD struct holds information about a potential reduction operation. struct ReductionInstDesc { ReductionInstDesc(bool IsRedux, Instruction *I) : IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {} ReductionInstDesc(Instruction *I, MinMaxReductionKind K) : IsReduction(true), PatternLastInst(I), MinMaxKind(K) {} // Is this instruction a reduction candidate. bool IsReduction; // The last instruction in a min/max pattern (select of the select(icmp()) // pattern), or the current reduction instruction otherwise. Instruction *PatternLastInst; // If this is a min/max pattern the comparison predicate. MinMaxReductionKind MinMaxKind; }; /// This struct holds information about the memory runtime legality /// check that a group of pointers do not overlap. struct RuntimePointerCheck { RuntimePointerCheck() : Need(false) {} /// Reset the state of the pointer runtime information. void reset() { Need = false; Pointers.clear(); Starts.clear(); Ends.clear(); IsWritePtr.clear(); DependencySetId.clear(); AliasSetId.clear(); } /// Insert a pointer and calculate the start and end SCEVs. void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides); /// This flag indicates if we need to add the runtime check. bool Need; /// Holds the pointers that we need to check. SmallVector, 2> Pointers; /// Holds the pointer value at the beginning of the loop. SmallVector Starts; /// Holds the pointer value at the end of the loop. SmallVector Ends; /// Holds the information if this pointer is used for writing to memory. SmallVector IsWritePtr; /// Holds the id of the set of pointers that could be dependent because of a /// shared underlying object. SmallVector DependencySetId; /// Holds the id of the disjoint alias set to which this pointer belongs. SmallVector AliasSetId; }; /// A struct for saving information about induction variables. struct InductionInfo { InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {} /// Start value. TrackingVH StartValue; /// Induction kind. InductionKind IK; }; /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. typedef DenseMap ReductionList; /// InductionList saves induction variables and maps them to the /// induction descriptor. typedef MapVector InductionList; /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this /// loop, only that it is legal to do so. bool canVectorize(); /// Returns the Induction variable. PHINode *getInduction() { return Induction; } /// Returns the reduction variables found in the loop. ReductionList *getReductionVars() { return &Reductions; } /// Returns the induction variables found in the loop. InductionList *getInductionVars() { return &Inductions; } /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } /// Returns True if V is an induction variable in this loop. bool isInductionVariable(const Value *V); /// Return true if the block BB needs to be predicated in order for the loop /// to be vectorized. bool blockNeedsPredication(BasicBlock *BB); /// Check if this pointer is consecutive when vectorizing. This happens /// when the last index of the GEP is the induction variable, or that the /// pointer itself is an induction variable. /// This check allows us to vectorize A[idx] into a wide load/store. /// Returns: /// 0 - Stride is unknown or non-consecutive. /// 1 - Address is consecutive. /// -1 - Address is consecutive, and decreasing. int isConsecutivePtr(Value *Ptr); /// Returns true if the value V is uniform within the loop. bool isUniform(Value *V); /// Returns true if this instruction will remain scalar after vectorization. bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } /// Returns the information that we collected about runtime memory check. RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } /// This function returns the identity element (or neutral element) for /// the operation K. static Constant *getReductionIdentity(ReductionKind K, Type *Tp); unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } bool hasStride(Value *V) { return StrideSet.count(V); } bool mustCheckStrides() { return !StrideSet.empty(); } SmallPtrSet::iterator strides_begin() { return StrideSet.begin(); } SmallPtrSet::iterator strides_end() { return StrideSet.end(); } /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedStore(Type *DataType, Value *Ptr) { return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr)); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr)); } /// Returns true if vector representation of the instruction \p I /// requires mask. bool isMaskRequired(const Instruction* I) { return (MaskedOp.count(I) != 0); } private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count /// and we only need to check individual instructions. bool canVectorizeInstrs(); /// When we vectorize loops we may change the order in which /// we read and write from memory. This method checks if it is /// legal to vectorize the code, considering only memory constrains. /// Returns true if the loop is vectorizable bool canVectorizeMemory(); /// Return true if we can vectorize this loop using the IF-conversion /// transformation. bool canVectorizeWithIfConvert(); /// Collect the variables that need to stay uniform after vectorization. void collectLoopUniforms(); /// Return true if all of the instructions in the block can be speculatively /// executed. \p SafePtrs is a list of addresses that are known to be legal /// and we know that we can read from them without segfault. bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl &SafePtrs); /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. bool AddReductionVar(PHINode *Phi, ReductionKind Kind); /// Returns a struct describing if the instruction 'I' can be a reduction /// variable of type 'Kind'. If the reduction is a min/max pattern of /// select(icmp()) this function advances the instruction pointer 'I' from the /// compare instruction to the select instruction and stores this pointer in /// 'PatternLastInst' member of the returned struct. ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind, ReductionInstDesc &Desc); /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction /// pattern corresponding to a min(X, Y) or max(X, Y). static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I, ReductionInstDesc &Prev); /// Returns the induction kind of Phi. This function may return NoInduction /// if the PHI is not an induction variable. InductionKind isInductionVariable(PHINode *Phi); /// \brief Collect memory access with loop invariant strides. /// /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop /// invariant. void collectStridedAccess(Value *LoadOrStoreInst); /// Report an analysis message to assist the user in diagnosing loops that are /// not vectorized. void emitAnalysis(Report &Message) { DebugLoc DL = TheLoop->getStartLoc(); if (Instruction *I = Message.getInstr()) DL = I->getDebugLoc(); emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE, *TheFunction, DL, Message.str()); } /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. ScalarEvolution *SE; /// DataLayout analysis. const DataLayout *DL; /// Dominators. DominatorTree *DT; /// Target Library Info. TargetLibraryInfo *TLI; /// Alias analysis. AliasAnalysis *AA; /// Parent function Function *TheFunction; /// Target Transform Info const TargetTransformInfo *TTI; // --- vectorization state --- // /// Holds the integer induction variable. This is the counter of the /// loop. PHINode *Induction; /// Holds the reduction variables. ReductionList Reductions; /// Holds all of the induction variables that we found in the loop. /// Notice that inductions don't need to start at zero and that induction /// variables can be pointers. InductionList Inductions; /// Holds the widest induction type encountered. Type *WidestIndTy; /// Allowed outside users. This holds the reduction /// vars which can be accessed from outside the loop. SmallPtrSet AllowedExit; /// This set holds the variables which are known to be uniform after /// vectorization. SmallPtrSet Uniforms; /// We need to check that all of the pointers in this list are disjoint /// at runtime. RuntimePointerCheck PtrRtCheck; /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; unsigned MaxSafeDepDistBytes; ValueToValueMap Strides; SmallPtrSet StrideSet; /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic SmallPtrSet MaskedOp; }; /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. This can happen because of /// a number of reasons. In this class we mainly attempt to predict the /// expected speedup/slowdowns due to the supported instruction set. We use the /// TargetTransformInfo to query the different backends for the cost of /// different operations. class LoopVectorizationCostModel { public: LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const DataLayout *DL, const TargetLibraryInfo *TLI, AssumptionCache *AC, const Function *F, const LoopVectorizeHints *Hints) : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI), TheFunction(F), Hints(Hints) { CodeMetrics::collectEphemeralValues(L, AC, EphValues); } /// Information about vectorization costs struct VectorizationFactor { unsigned Width; // Vector width with best cost unsigned Cost; // Cost of the loop with that width }; /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to VF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. VectorizationFactor selectVectorizationFactor(bool OptForSize); /// \return The size (in bits) of the widest type in the code that /// needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. unsigned getWidestType(); /// \return The most profitable unroll factor. /// If UserUF is non-zero then this method finds the best unroll-factor /// based on register pressure and other parameters. /// VF and LoopCost are the selected vectorization factor and the cost of the /// selected VF. unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost); /// \brief A struct that represents some properties of the register usage /// of a loop. struct RegisterUsage { /// Holds the number of loop invariant values that are used in the loop. unsigned LoopInvariantRegs; /// Holds the maximum number of concurrent live intervals in the loop. unsigned MaxLocalUsers; /// Holds the number of instructions in the loop. unsigned NumInstructions; }; /// \return information about the register usage of the loop. RegisterUsage calculateRegisterUsage(); private: /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. unsigned expectedCost(unsigned VF); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. unsigned getInstructionCost(Instruction *I, unsigned VF); /// A helper function for converting Scalar types to vector types. /// If the incoming type is void, we return void. If the VF is 1, we return /// the scalar type. static Type* ToVectorTy(Type *Scalar, unsigned VF); /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. bool isConsecutiveLoadOrStore(Instruction *I); /// Report an analysis message to assist the user in diagnosing loops that are /// not vectorized. void emitAnalysis(Report &Message) { DebugLoc DL = TheLoop->getStartLoc(); if (Instruction *I = Message.getInstr()) DL = I->getDebugLoc(); emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE, *TheFunction, DL, Message.str()); } /// Values used only by @llvm.assume calls. SmallPtrSet EphValues; /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. ScalarEvolution *SE; /// Loop Info analysis. LoopInfo *LI; /// Vectorization legality. LoopVectorizationLegality *Legal; /// Vector target information. const TargetTransformInfo &TTI; /// Target data layout information. const DataLayout *DL; /// Target Library Info. const TargetLibraryInfo *TLI; const Function *TheFunction; // Loop Vectorize Hint. const LoopVectorizeHints *Hints; }; /// Utility class for getting and setting loop vectorizer hints in the form /// of loop metadata. /// This class keeps a number of loop annotations locally (as member variables) /// and can, upon request, write them back as metadata on the loop. It will /// initially scan the loop for existing metadata, and will update the local /// values based on information in the loop. /// We cannot write all values to metadata, as the mere presence of some info, /// for example 'force', means a decision has been made. So, we need to be /// careful NOT to add them if the user hasn't specifically asked so. class LoopVectorizeHints { enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE }; /// Hint - associates name and validation with the hint value. struct Hint { const char * Name; unsigned Value; // This may have to change for non-numeric values. HintKind Kind; Hint(const char * Name, unsigned Value, HintKind Kind) : Name(Name), Value(Value), Kind(Kind) { } bool validate(unsigned Val) { switch (Kind) { case HK_WIDTH: return isPowerOf2_32(Val) && Val <= MaxVectorWidth; case HK_UNROLL: return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; case HK_FORCE: return (Val <= 1); } return false; } }; /// Vectorization width. Hint Width; /// Vectorization interleave factor. Hint Interleave; /// Vectorization forced Hint Force; /// Return the loop metadata prefix. static StringRef Prefix() { return "llvm.loop."; } public: enum ForceKind { FK_Undefined = -1, ///< Not selected. FK_Disabled = 0, ///< Forcing disabled. FK_Enabled = 1, ///< Forcing enabled. }; LoopVectorizeHints(const Loop *L, bool DisableInterleaving) : Width("vectorize.width", VectorizationFactor, HK_WIDTH), Interleave("interleave.count", DisableInterleaving, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), TheLoop(L) { // Populate values with existing loop metadata. getHintsFromMetadata(); // force-vector-interleave overrides DisableInterleaving. if (VectorizationInterleave.getNumOccurrences() > 0) Interleave.Value = VectorizationInterleave; DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n"); } /// Mark the loop L as already vectorized by setting the width to 1. void setAlreadyVectorized() { Width.Value = Interleave.Value = 1; Hint Hints[] = {Width, Interleave}; writeHintsToMetadata(Hints); } /// Dumps all the hint information. std::string emitRemark() const { Report R; if (Force.Value == LoopVectorizeHints::FK_Disabled) R << "vectorization is explicitly disabled"; else { R << "use -Rpass-analysis=loop-vectorize for more info"; if (Force.Value == LoopVectorizeHints::FK_Enabled) { R << " (Force=true"; if (Width.Value != 0) R << ", Vector Width=" << Width.Value; if (Interleave.Value != 0) R << ", Interleave Count=" << Interleave.Value; R << ")"; } } return R.str(); } unsigned getWidth() const { return Width.Value; } unsigned getInterleave() const { return Interleave.Value; } enum ForceKind getForce() const { return (ForceKind)Force.Value; } private: /// Find hints specified in the loop metadata and update local values. void getHintsFromMetadata() { MDNode *LoopID = TheLoop->getLoopID(); if (!LoopID) return; // First operand should refer to the loop id itself. assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { const MDString *S = nullptr; SmallVector Args; // The expected hint is either a MDString or a MDNode with the first // operand a MDString. if (const MDNode *MD = dyn_cast(LoopID->getOperand(i))) { if (!MD || MD->getNumOperands() == 0) continue; S = dyn_cast(MD->getOperand(0)); for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) Args.push_back(MD->getOperand(i)); } else { S = dyn_cast(LoopID->getOperand(i)); assert(Args.size() == 0 && "too many arguments for MDString"); } if (!S) continue; // Check if the hint starts with the loop metadata prefix. StringRef Name = S->getString(); if (Args.size() == 1) setHint(Name, Args[0]); } } /// Checks string hint with one operand and set value if valid. void setHint(StringRef Name, Metadata *Arg) { if (!Name.startswith(Prefix())) return; Name = Name.substr(Prefix().size(), StringRef::npos); const ConstantInt *C = mdconst::dyn_extract(Arg); if (!C) return; unsigned Val = C->getZExtValue(); Hint *Hints[] = {&Width, &Interleave, &Force}; for (auto H : Hints) { if (Name == H->Name) { if (H->validate(Val)) H->Value = Val; else DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); break; } } } /// Create a new hint from name / value pair. MDNode *createHintMetadata(StringRef Name, unsigned V) const { LLVMContext &Context = TheLoop->getHeader()->getContext(); Metadata *MDs[] = {MDString::get(Context, Name), ConstantAsMetadata::get( ConstantInt::get(Type::getInt32Ty(Context), V))}; return MDNode::get(Context, MDs); } /// Matches metadata with hint name. bool matchesHintMetadataName(MDNode *Node, ArrayRef HintTypes) { MDString* Name = dyn_cast(Node->getOperand(0)); if (!Name) return false; for (auto H : HintTypes) if (Name->getString().endswith(H.Name)) return true; return false; } /// Sets current hints into loop metadata, keeping other values intact. void writeHintsToMetadata(ArrayRef HintTypes) { if (HintTypes.size() == 0) return; // Reserve the first element to LoopID (see below). SmallVector MDs(1); // If the loop already has metadata, then ignore the existing operands. MDNode *LoopID = TheLoop->getLoopID(); if (LoopID) { for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { MDNode *Node = cast(LoopID->getOperand(i)); // If node in update list, ignore old value. if (!matchesHintMetadataName(Node, HintTypes)) MDs.push_back(Node); } } // Now, add the missing hints. for (auto H : HintTypes) MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); // Replace current metadata node with new one. LLVMContext &Context = TheLoop->getHeader()->getContext(); MDNode *NewLoopID = MDNode::get(Context, MDs); // Set operand 0 to refer to the loop id itself. NewLoopID->replaceOperandWith(0, NewLoopID); TheLoop->setLoopID(NewLoopID); } /// The loop these hints belong to. const Loop *TheLoop; }; static void emitMissedWarning(Function *F, Loop *L, const LoopVectorizeHints &LH) { emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), LH.emitRemark()); if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { if (LH.getWidth() != 1) emitLoopVectorizeWarning( F->getContext(), *F, L->getStartLoc(), "failed explicitly specified loop vectorization"); else if (LH.getInterleave() != 1) emitLoopInterleaveWarning( F->getContext(), *F, L->getStartLoc(), "failed explicitly specified loop interleaving"); } } static void addInnerLoop(Loop &L, SmallVectorImpl &V) { if (L.empty()) return V.push_back(&L); for (Loop *InnerL : L) addInnerLoop(*InnerL, V); } /// The LoopVectorize Pass. struct LoopVectorize : public FunctionPass { /// Pass identification, replacement for typeid static char ID; explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true) : FunctionPass(ID), DisableUnrolling(NoUnrolling), AlwaysVectorize(AlwaysVectorize) { initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); } ScalarEvolution *SE; const DataLayout *DL; LoopInfo *LI; TargetTransformInfo *TTI; DominatorTree *DT; BlockFrequencyInfo *BFI; TargetLibraryInfo *TLI; AliasAnalysis *AA; AssumptionCache *AC; bool DisableUnrolling; bool AlwaysVectorize; BlockFrequency ColdEntryFreq; bool runOnFunction(Function &F) override { SE = &getAnalysis(); DataLayoutPass *DLP = getAnalysisIfAvailable(); DL = DLP ? &DLP->getDataLayout() : nullptr; LI = &getAnalysis(); TTI = &getAnalysis(); DT = &getAnalysis().getDomTree(); BFI = &getAnalysis(); TLI = getAnalysisIfAvailable(); AA = &getAnalysis(); AC = &getAnalysis().getAssumptionCache(F); // Compute some weights outside of the loop over the loops. Compute this // using a BranchProbability to re-use its scaling math. const BranchProbability ColdProb(1, 5); // 20% ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb; // If the target claims to have no vector registers don't attempt // vectorization. if (!TTI->getNumberOfRegisters(true)) return false; if (!DL) { DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName() << ": Missing data layout\n"); return false; } // Build up a worklist of inner-loops to vectorize. This is necessary as // the act of vectorizing or partially unrolling a loop creates new loops // and can invalidate iterators across the loops. SmallVector Worklist; for (Loop *L : *LI) addInnerLoop(*L, Worklist); LoopsAnalyzed += Worklist.size(); // Now walk the identified inner loops. bool Changed = false; while (!Worklist.empty()) Changed |= processLoop(Worklist.pop_back_val()); // Process each loop nest in the function. return Changed; } bool processLoop(Loop *L) { assert(L->empty() && "Only process inner loops."); #ifndef NDEBUG const std::string DebugLocStr = getDebugLocString(L); #endif /* NDEBUG */ DEBUG(dbgs() << "\nLV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"); LoopVectorizeHints Hints(L, DisableUnrolling); DEBUG(dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints::FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints::FK_Enabled ? "enabled" : "?")) << " width=" << Hints.getWidth() << " unroll=" << Hints.getInterleave() << "\n"); // Function containing loop Function *F = L->getHeader()->getParent(); // Looking at the diagnostic output is the only way to determine if a loop // was vectorized (other than looking at the IR or machine code), so it // is important to generate an optimization remark for each loop. Most of // these messages are generated by emitOptimizationRemarkAnalysis. Remarks // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are // less verbose reporting vectorized loops and unvectorized loops that may // benefit from vectorization, respectively. if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) { DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), Hints.emitRemark()); return false; } if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) { DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), Hints.emitRemark()); return false; } if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) { DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); emitOptimizationRemarkAnalysis( F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), "loop not vectorized: vector width and interleave count are " "explicitly set to 1"); return false; } // Check the loop for a trip count threshold: // do not vectorize loops with a tiny trip count. const unsigned TC = SE->getSmallConstantTripCount(L); if (TC > 0u && TC < TinyTripCountVectorThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is not worth vectorizing."); if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { DEBUG(dbgs() << "\n"); emitOptimizationRemarkAnalysis( F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), "vectorization is not beneficial and is not explicitly forced"); return false; } } // Check if it is legal to vectorize the loop. LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); emitMissedWarning(F, L, Hints); return false; } // Use the cost model. LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AC, F, &Hints); // Check the function attributes to find out if this function should be // optimized for size. bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->hasFnAttribute(Attribute::OptimizeForSize); // Compute the weighted frequency of this loop being executed and see if it // is less than 20% of the function entry baseline frequency. Note that we // always have a canonical loop here because we think we *can* vectoriez. // FIXME: This is hidden behind a flag due to pervasive problems with // exactly what block frequency models. if (LoopVectorizeWithBlockFrequency) { BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader()); if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && LoopEntryFreq < ColdEntryFreq) OptForSize = true; } // Check the function attributes to see if implicit floats are allowed.a // FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer // vector instructions? if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" "attribute is used.\n"); emitOptimizationRemarkAnalysis( F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), "loop not vectorized due to NoImplicitFloat attribute"); emitMissedWarning(F, L, Hints); return false; } // Select the optimal vectorization factor. const LoopVectorizationCostModel::VectorizationFactor VF = CM.selectVectorizationFactor(OptForSize); // Select the unroll factor. const unsigned UF = CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost); DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " << DebugLocStr << '\n'); DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n'); if (VF.Width == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n"); if (UF == 1) { emitOptimizationRemarkAnalysis( F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), "not beneficial to vectorize and user disabled interleaving"); return false; } DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n"); // Report the unrolling decision. emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), Twine("unrolled with interleaving factor " + Twine(UF) + " (vectorization not beneficial)")); // We decided not to vectorize, but we may want to unroll. InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF); Unroller.vectorize(&LVL); } else { // If we decided that it is *legal* to vectorize the loop then do it. InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); LB.vectorize(&LVL); ++LoopsVectorized; // Report the vectorization decision. emitOptimizationRemark( F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) + ", unrolling interleave factor: " + Twine(UF) + ")"); } // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(); DEBUG(verifyFunction(*L->getHeader()->getParent())); return true; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); } }; } // end anonymous namespace //===----------------------------------------------------------------------===// // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and // LoopVectorizationCostModel. //===----------------------------------------------------------------------===// static Value *stripIntegerCast(Value *V) { if (CastInst *CI = dyn_cast(V)) if (CI->getOperand(0)->getType()->isIntegerTy()) return CI->getOperand(0); return V; } ///\brief Replaces the symbolic stride in a pointer SCEV expression by one. /// /// If \p OrigPtr is not null, use it to look up the stride value instead of /// \p Ptr. static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE, ValueToValueMap &PtrToStride, Value *Ptr, Value *OrigPtr = nullptr) { const SCEV *OrigSCEV = SE->getSCEV(Ptr); // If there is an entry in the map return the SCEV of the pointer with the // symbolic stride replaced by one. ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr); if (SI != PtrToStride.end()) { Value *StrideVal = SI->second; // Strip casts. StrideVal = stripIntegerCast(StrideVal); // Replace symbolic stride by one. Value *One = ConstantInt::get(StrideVal->getType(), 1); ValueToValueMap RewriteMap; RewriteMap[StrideVal] = One; const SCEV *ByOne = SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true); DEBUG(dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne << "\n"); return ByOne; } // Otherwise, just return the SCEV of the original pointer. return SE->getSCEV(Ptr); } void LoopVectorizationLegality::RuntimePointerCheck::insert( ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides) { // Get the stride replaced scev. const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr); const SCEVAddRecExpr *AR = dyn_cast(Sc); assert(AR && "Invalid addrec expression"); const SCEV *Ex = SE->getBackedgeTakenCount(Lp); const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); Pointers.push_back(Ptr); Starts.push_back(AR->getStart()); Ends.push_back(ScEnd); IsWritePtr.push_back(WritePtr); DependencySetId.push_back(DepSetId); AliasSetId.push_back(ASId); } Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { // We need to place the broadcast of invariant variables outside the loop. Instruction *Instr = dyn_cast(V); bool NewInstr = (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(), Instr->getParent()) != LoopVectorBody.end()); bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; // Place the code for broadcasting invariant variables in the new preheader. IRBuilder<>::InsertPointGuard Guard(Builder); if (Invariant) Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); // Broadcast the scalar into all locations in the vector. Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); return Shuf; } Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, bool Negate) { assert(Val->getType()->isVectorTy() && "Must be a vector"); assert(Val->getType()->getScalarType()->isIntegerTy() && "Elem must be an integer"); // Create the types. Type *ITy = Val->getType()->getScalarType(); VectorType *Ty = cast(Val->getType()); int VLen = Ty->getNumElements(); SmallVector Indices; // Create a vector of consecutive numbers from zero to VF. for (int i = 0; i < VLen; ++i) { int64_t Idx = Negate ? (-i) : i; Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate)); } // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); return Builder.CreateAdd(Val, Cv, "induction"); } /// \brief Find the operand of the GEP that should be checked for consecutive /// stores. This ignores trailing indices that have no effect on the final /// pointer. static unsigned getGEPInductionOperand(const DataLayout *DL, const GetElementPtrInst *Gep) { unsigned LastOperand = Gep->getNumOperands() - 1; unsigned GEPAllocSize = DL->getTypeAllocSize( cast(Gep->getType()->getScalarType())->getElementType()); // Walk backwards and try to peel off zeros. while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { // Find the type we're currently indexing into. gep_type_iterator GEPTI = gep_type_begin(Gep); std::advance(GEPTI, LastOperand - 1); // If it's a type with the same allocation size as the result of the GEP we // can peel off the zero index. if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize) break; --LastOperand; } return LastOperand; } int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); // Make sure that the pointer does not point to structs. if (Ptr->getType()->getPointerElementType()->isAggregateType()) return 0; // If this value is a pointer induction variable we know it is consecutive. PHINode *Phi = dyn_cast_or_null(Ptr); if (Phi && Inductions.count(Phi)) { InductionInfo II = Inductions[Phi]; if (IK_PtrInduction == II.IK) return 1; else if (IK_ReversePtrInduction == II.IK) return -1; } GetElementPtrInst *Gep = dyn_cast_or_null(Ptr); if (!Gep) return 0; unsigned NumOperands = Gep->getNumOperands(); Value *GpPtr = Gep->getPointerOperand(); // If this GEP value is a consecutive pointer induction variable and all of // the indices are constant then we know it is consecutive. We can Phi = dyn_cast(GpPtr); if (Phi && Inductions.count(Phi)) { // Make sure that the pointer does not point to structs. PointerType *GepPtrType = cast(GpPtr->getType()); if (GepPtrType->getElementType()->isAggregateType()) return 0; // Make sure that all of the index operands are loop invariant. for (unsigned i = 1; i < NumOperands; ++i) if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) return 0; InductionInfo II = Inductions[Phi]; if (IK_PtrInduction == II.IK) return 1; else if (IK_ReversePtrInduction == II.IK) return -1; } unsigned InductionOperand = getGEPInductionOperand(DL, Gep); // Check that all of the gep indices are uniform except for our induction // operand. for (unsigned i = 0; i != NumOperands; ++i) if (i != InductionOperand && !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) return 0; // We can emit wide load/stores only if the last non-zero index is the // induction variable. const SCEV *Last = nullptr; if (!Strides.count(Gep)) Last = SE->getSCEV(Gep->getOperand(InductionOperand)); else { // Because of the multiplication by a stride we can have a s/zext cast. // We are going to replace this stride by 1 so the cast is safe to ignore. // // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] // %0 = trunc i64 %indvars.iv to i32 // %mul = mul i32 %0, %Stride1 // %idxprom = zext i32 %mul to i64 << Safe cast. // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom // Last = replaceSymbolicStrideSCEV(SE, Strides, Gep->getOperand(InductionOperand), Gep); if (const SCEVCastExpr *C = dyn_cast(Last)) Last = (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend) ? C->getOperand() : Last; } if (const SCEVAddRecExpr *AR = dyn_cast(Last)) { const SCEV *Step = AR->getStepRecurrence(*SE); // The memory is consecutive because the last index is consecutive // and all other indices are loop invariant. if (Step->isOne()) return 1; if (Step->isAllOnesValue()) return -1; } return 0; } bool LoopVectorizationLegality::isUniform(Value *V) { return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); } InnerLoopVectorizer::VectorParts& InnerLoopVectorizer::getVectorValue(Value *V) { assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); // If we have a stride that is replaced by one, do it here. if (Legal->hasStride(V)) V = ConstantInt::get(V->getType(), 1); // If we have this scalar in the map, return it. if (WidenMap.has(V)) return WidenMap.get(V); // If this scalar is unknown, assume that it is a constant or that it is // loop invariant. Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); return WidenMap.splat(V, B); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); SmallVector ShuffleMask; for (unsigned i = 0; i < VF; ++i) ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), ConstantVector::get(ShuffleMask), "reverse"); } void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); StoreInst *SI = dyn_cast(Instr); assert((LI || SI) && "Invalid Load/Store instruction"); Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. if (!Alignment) Alignment = DL->getABITypeAlignment(ScalarDataTy); unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; if (SI && Legal->blockNeedsPredication(SI->getParent()) && !Legal->isMaskRequired(SI)) return scalarizeInstruction(Instr, true); if (ScalarAllocatedSize != VectorElementSize) return scalarizeInstruction(Instr); // If the pointer is loop invariant or if it is non-consecutive, // scalarize the load. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; bool UniformLoad = LI && Legal->isUniform(Ptr); if (!ConsecutiveStride || UniformLoad) return scalarizeInstruction(Instr); Constant *Zero = Builder.getInt32(0); VectorParts &Entry = WidenMap.get(Instr); // Handle consecutive loads/stores. GetElementPtrInst *Gep = dyn_cast(Ptr); if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { setDebugLocFromInst(Builder, Gep); Value *PtrOperand = Gep->getPointerOperand(); Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast(Gep->clone()); Gep2->setOperand(0, FirstBasePtr); Gep2->setName("gep.indvar.base"); Ptr = Builder.Insert(Gep2); } else if (Gep) { setDebugLocFromInst(Builder, Gep); assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), OrigLoop) && "Base ptr must be invariant"); // The last index does not have to be the induction. It can be // consecutive and be a function of the index. For example A[I+1]; unsigned NumOperands = Gep->getNumOperands(); unsigned InductionOperand = getGEPInductionOperand(DL, Gep); // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast(Gep->clone()); for (unsigned i = 0; i < NumOperands; ++i) { Value *GepOperand = Gep->getOperand(i); Instruction *GepOperandInst = dyn_cast(GepOperand); // Update last index or loop invariant instruction anchored in loop. if (i == InductionOperand || (GepOperandInst && OrigLoop->contains(GepOperandInst))) { assert((i == InductionOperand || SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && "Must be last index or loop invariant"); VectorParts &GEPParts = getVectorValue(GepOperand); Value *Index = GEPParts[0]; Index = Builder.CreateExtractElement(Index, Zero); Gep2->setOperand(i, Index); Gep2->setName("gep.indvar.idx"); } } Ptr = Builder.Insert(Gep2); } else { // Use the induction element ptr. assert(isa(Ptr) && "Invalid induction ptr"); setDebugLocFromInst(Builder, Ptr); VectorParts &PtrVal = getVectorValue(Ptr); Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); } VectorParts Mask = createBlockInMask(Instr->getParent()); // Handle Stores: if (SI) { assert(!Legal->isUniform(SI->getPointerOperand()) && "We do not allow storing to uniform addresses"); setDebugLocFromInst(Builder, SI); // We don't want to update the value in the map as it might be used in // another expression. So don't use a reference type for "StoredVal". VectorParts StoredVal = getVectorValue(SI->getValueOperand()); for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); if (Reverse) { // If we store to reverse consecutive memory locations then we need // to reverse the order of elements in the stored value. StoredVal[Part] = reverseVector(StoredVal[Part]); // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + Mask[Part] = reverseVector(Mask[Part]); } Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); Instruction *NewSI; if (Legal->isMaskRequired(SI)) NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, Mask[Part]); else NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); propagateMetadata(NewSI, SI); } return; } // Handle loads. assert(LI && "Must have a load instruction"); setDebugLocFromInst(Builder, LI); for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); if (Reverse) { // If the address is consecutive but reversed, then the // wide load needs to start at the last vector element. PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + Mask[Part] = reverseVector(Mask[Part]); } Instruction* NewLI; Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); if (Legal->isMaskRequired(LI)) NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], UndefValue::get(DataTy), "wide.masked.load"); else NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); propagateMetadata(NewLI, LI); Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; } } void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector Params; setDebugLocFromInst(Builder, Instr); // Find all of the vectorized parameters. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { Value *SrcOp = Instr->getOperand(op); // If we are accessing the old induction variable, use the new one. if (SrcOp == OldInduction) { Params.push_back(getVectorValue(SrcOp)); continue; } // Try using previously calculated values. Instruction *SrcInst = dyn_cast(SrcOp); // If the src is an instruction that appeared earlier in the basic block // then it should already be vectorized. if (SrcInst && OrigLoop->contains(SrcInst)) { assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); // The parameter is a vector value from earlier. Params.push_back(WidenMap.get(SrcInst)); } else { // The parameter is a scalar from outside the loop. Maybe even a constant. VectorParts Scalars; Scalars.append(UF, SrcOp); Params.push_back(Scalars); } } assert(Params.size() == Instr->getNumOperands() && "Invalid number of operands"); // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); Value *UndefVec = IsVoidRetTy ? nullptr : UndefValue::get(VectorType::get(Instr->getType(), VF)); // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); Instruction *InsertPt = Builder.GetInsertPoint(); BasicBlock *IfBlock = Builder.GetInsertBlock(); BasicBlock *CondBlock = nullptr; VectorParts Cond; Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), Instr->getParent()); VectorLp = LI->getLoopFor(IfBlock); assert(VectorLp && "Must have a loop for this block"); } // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { // For each scalar that we create: for (unsigned Width = 0; Width < VF; ++Width) { // Start if-block. Value *Cmp = nullptr; if (IfPredicateStore) { Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); LoopVectorBody.push_back(CondBlock); VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); // Update Builder with newly created basic block. Builder.SetInsertPoint(InsertPt); } Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); // Replace the operands of the cloned instructions with extracted scalars. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { Value *Op = Params[op][Part]; // Param is a vector. Need to extract the right lane. if (Op->getType()->isVectorTy()) Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width)); Cloned->setOperand(op, Op); } // Place the cloned scalar in the new loop. Builder.Insert(Cloned); // If the original scalar returns a value we need to place it in a vector // so that future users will be able to use it. if (!IsVoidRetTy) VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, Builder.getInt32(Width)); // End if-block. if (IfPredicateStore) { BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); LoopVectorBody.push_back(NewIfBlock); VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); OldBr->eraseFromParent(); IfBlock = NewIfBlock; } } } } static Instruction *getFirstInst(Instruction *FirstInst, Value *V, Instruction *Loc) { if (FirstInst) return FirstInst; if (Instruction *I = dyn_cast(V)) return I->getParent() == Loc->getParent() ? I : nullptr; return nullptr; } std::pair InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { Instruction *tnullptr = nullptr; if (!Legal->mustCheckStrides()) return std::pair(tnullptr, tnullptr); IRBuilder<> ChkBuilder(Loc); // Emit checks. Value *Check = nullptr; Instruction *FirstInst = nullptr; for (SmallPtrSet::iterator SI = Legal->strides_begin(), SE = Legal->strides_end(); SI != SE; ++SI) { Value *Ptr = stripIntegerCast(*SI); Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1), "stride.chk"); // Store the first instruction we create. FirstInst = getFirstInst(FirstInst, C, Loc); if (Check) Check = ChkBuilder.CreateOr(Check, C); else Check = C; } // We have to do this trickery because the IRBuilder might fold the check to a // constant expression in which case there is no Instruction anchored in a // the block. LLVMContext &Ctx = Loc->getContext(); Instruction *TheCheck = BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx)); ChkBuilder.Insert(TheCheck, "stride.not.one"); FirstInst = getFirstInst(FirstInst, TheCheck, Loc); return std::make_pair(FirstInst, TheCheck); } std::pair InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = Legal->getRuntimePointerCheck(); Instruction *tnullptr = nullptr; if (!PtrRtCheck->Need) return std::pair(tnullptr, tnullptr); unsigned NumPointers = PtrRtCheck->Pointers.size(); SmallVector , 2> Starts; SmallVector , 2> Ends; LLVMContext &Ctx = Loc->getContext(); SCEVExpander Exp(*SE, "induction"); Instruction *FirstInst = nullptr; for (unsigned i = 0; i < NumPointers; ++i) { Value *Ptr = PtrRtCheck->Pointers[i]; const SCEV *Sc = SE->getSCEV(Ptr); if (SE->isLoopInvariant(Sc, OrigLoop)) { DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << *Ptr <<"\n"); Starts.push_back(Ptr); Ends.push_back(Ptr); } else { DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n'); unsigned AS = Ptr->getType()->getPointerAddressSpace(); // Use this type for pointer arithmetic. Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); Starts.push_back(Start); Ends.push_back(End); } } IRBuilder<> ChkBuilder(Loc); // Our instructions might fold to a constant. Value *MemoryRuntimeCheck = nullptr; for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i+1; j < NumPointers; ++j) { // No need to check if two readonly pointers intersect. if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) continue; // Only need to check pointers between two different dependency sets. if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) continue; // Only need to check pointers in the same alias set. if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j]) continue; unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && "Trying to bounds check pointers with different address spaces"); Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); FirstInst = getFirstInst(FirstInst, Cmp0, Loc); Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); FirstInst = getFirstInst(FirstInst, Cmp1, Loc); Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); FirstInst = getFirstInst(FirstInst, IsConflict, Loc); if (MemoryRuntimeCheck) { IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx"); FirstInst = getFirstInst(FirstInst, IsConflict, Loc); } MemoryRuntimeCheck = IsConflict; } } // We have to do this trickery because the IRBuilder might fold the check to a // constant expression in which case there is no Instruction anchored in a // the block. Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, ConstantInt::getTrue(Ctx)); ChkBuilder.Insert(Check, "memcheck.conflict"); FirstInst = getFirstInst(FirstInst, Check, Loc); return std::make_pair(FirstInst, Check); } void InnerLoopVectorizer::createEmptyLoop() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the scalar remainder. [ ] <-- Back-edge taken count overflow check. / | / v | [ ] <-- vector loop bypass (may consist of multiple blocks). | / | | / v || [ ] <-- vector pre header. || | || v || [ ] \ || [ ]_| <-- vector loop. || | | \ v | >[ ] <--- middle-block. | / | | / v -|- >[ ] <--- new preheader. | | | v | [ ] \ | [ ]_| <-- old scalar loop to handle remainder. \ | \ v >[ ] <-- exit block. ... */ BasicBlock *OldBasicBlock = OrigLoop->getHeader(); BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(BypassBlock && "Invalid loop structure"); assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer // induction variables. In the code below we also support a case where we // don't have a single induction variable. OldInduction = Legal->getInduction(); Type *IdxTy = Legal->getWidestInductionType(); // Find the loop boundaries. const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); // The exit count might have the type of i64 while the phi is i32. This can // happen if we have an induction variable that is sign extended before the // compare. The only way that we get a backedge taken count is that the // induction variable was signed and as such will not overflow. In such a case // truncation is legal. if (ExitCount->getType()->getPrimitiveSizeInBits() > IdxTy->getPrimitiveSizeInBits()) ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); // Get the total trip count from the count by adding 1. ExitCount = SE->getAddExpr(BackedgeTakeCount, SE->getConstant(BackedgeTakeCount->getType(), 1)); // Expand the trip count and place the new instructions in the preheader. // Notice that the pre-header does not change, only the loop body. SCEVExpander Exp(*SE, "induction"); // We need to test whether the backedge-taken count is uint##_max. Adding one // to it will cause overflow and an incorrect loop trip count in the vector // body. In case of overflow we want to directly jump to the scalar remainder // loop. Value *BackedgeCount = Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), BypassBlock->getTerminator()); if (BackedgeCount->getType()->isPointerTy()) BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, "backedge.ptrcnt.to.int", BypassBlock->getTerminator()); Instruction *CheckBCOverflow = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, Constant::getAllOnesValue(BackedgeCount->getType()), "backedge.overflow", BypassBlock->getTerminator()); // The loop index does not have to start at Zero. Find the original start // value from the induction PHI node. If we don't have an induction variable // then we know that it starts at zero. Builder.SetInsertPoint(BypassBlock->getTerminator()); Value *StartIdx = ExtendedIdx = OldInduction ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock), IdxTy): ConstantInt::get(IdxTy, 0); // We need an instruction to anchor the overflow check on. StartIdx needs to // be defined before the overflow check branch. Because the scalar preheader // is going to merge the start index and so the overflow branch block needs to // contain a definition of the start index. Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd( StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor", BypassBlock->getTerminator()); // Count holds the overall loop count (N). Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), BypassBlock->getTerminator()); LoopBypassBlocks.push_back(BypassBlock); // Split the single block loop into the two loop structure described above. BasicBlock *VectorPH = BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); BasicBlock *ScalarPH = MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); // Create and register the new vector loop. Loop* Lp = new Loop(); Loop *ParentLoop = OrigLoop->getParentLoop(); // Insert the new loop into the loop nest and register the new basic blocks // before calling any utilities such as SCEV that require valid LoopInfo. if (ParentLoop) { ParentLoop->addChildLoop(Lp); ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); } else { LI->addTopLevelLoop(Lp); } Lp->addBasicBlockToLoop(VecBody, LI->getBase()); // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. Builder.SetInsertPoint(VecBody->getFirstNonPHI()); // Generate the induction variable. setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); Induction = Builder.CreatePHI(IdxTy, 2, "index"); // The loop step is equal to the vectorization factor (num of SIMD elements) // times the unroll factor (num of SIMD instructions). Constant *Step = ConstantInt::get(IdxTy, VF * UF); // This is the IR builder that we use to add all of the logic for bypassing // the new vector loop. IRBuilder<> BypassBuilder(BypassBlock->getTerminator()); setDebugLocFromInst(BypassBuilder, getDebugLocFromInstOrOperands(OldInduction)); // We may need to extend the index in case there is a type mismatch. // We know that the count starts at zero and does not overflow. if (Count->getType() != IdxTy) { // The exit count can be of pointer type. Convert it to the correct // integer type. if (ExitCount->getType()->isPointerTy()) Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); else Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); } // Add the start index to the loop count to get the new end index. Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); // Now we need to generate the expression for N - (N % VF), which is // the part that the vectorized body will execute. Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, "end.idx.rnd.down"); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); BasicBlock *LastBypassBlock = BypassBlock; // Generate code to check that the loops trip count that we computed by adding // one to the backedge-taken count will not overflow. { auto PastOverflowCheck = std::next(BasicBlock::iterator(OverflowCheckAnchor)); BasicBlock *CheckBlock = LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked"); if (ParentLoop) ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); LoopBypassBlocks.push_back(CheckBlock); Instruction *OldTerm = LastBypassBlock->getTerminator(); BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm); OldTerm->eraseFromParent(); LastBypassBlock = CheckBlock; } // Generate the code to check that the strides we assumed to be one are really // one. We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. Instruction *StrideCheck; Instruction *FirstCheckInst; std::tie(FirstCheckInst, StrideCheck) = addStrideCheck(LastBypassBlock->getTerminator()); if (StrideCheck) { // Create a new block containing the stride check. BasicBlock *CheckBlock = LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck"); if (ParentLoop) ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); LoopBypassBlocks.push_back(CheckBlock); // Replace the branch into the memory check block with a conditional branch // for the "few elements case". Instruction *OldTerm = LastBypassBlock->getTerminator(); BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); OldTerm->eraseFromParent(); Cmp = StrideCheck; LastBypassBlock = CheckBlock; } // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeCheck(LastBypassBlock->getTerminator()); if (MemRuntimeCheck) { // Create a new block containing the memory check. BasicBlock *CheckBlock = LastBypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck"); if (ParentLoop) ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); LoopBypassBlocks.push_back(CheckBlock); // Replace the branch into the memory check block with a conditional branch // for the "few elements case". Instruction *OldTerm = LastBypassBlock->getTerminator(); BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); OldTerm->eraseFromParent(); Cmp = MemRuntimeCheck; LastBypassBlock = CheckBlock; } LastBypassBlock->getTerminator()->eraseFromParent(); BranchInst::Create(MiddleBlock, VectorPH, Cmp, LastBypassBlock); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. // The starting values of PHI nodes depend on the counter of the last // iteration in the vectorized loop. // If we come from a bypass edge then we need to start from the original // start value. // This variable saves the new starting index for the scalar loop. PHINode *ResumeIndex = nullptr; LoopVectorizationLegality::InductionList::iterator I, E; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); // Set builder to point to last bypass block. BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); for (I = List->begin(), E = List->end(); I != E; ++I) { PHINode *OrigPhi = I->first; LoopVectorizationLegality::InductionInfo II = I->second; Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", MiddleBlock->getTerminator()); // We might have extended the type of the induction variable but we need a // truncated version for the scalar loop. PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", MiddleBlock->getTerminator()) : nullptr; // Create phi nodes to merge from the backedge-taken check block. PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val", ScalarPH->getTerminator()); BCResumeVal->addIncoming(ResumeVal, MiddleBlock); PHINode *BCTruncResumeVal = nullptr; if (OrigPhi == OldInduction) { BCTruncResumeVal = PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val", ScalarPH->getTerminator()); BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock); } Value *EndValue = nullptr; switch (II.IK) { case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); case LoopVectorizationLegality::IK_IntInduction: { // Handle the integer induction counter. assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); // We have the canonical induction variable. if (OrigPhi == OldInduction) { // Create a truncated version of the resume value for the scalar loop, // we might have promoted the type to a larger width. EndValue = BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); TruncResumeVal->addIncoming(EndValue, VecBody); BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); // We know what the end value is. EndValue = IdxEndRoundDown; // We also know which PHI node holds it. ResumeIndex = ResumeVal; break; } // Not the canonical induction variable - add the vector loop count to the // start value. Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, II.StartValue->getType(), "cast.crd"); EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end"); break; } case LoopVectorizationLegality::IK_ReverseIntInduction: { // Convert the CountRoundDown variable to the PHI size. Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, II.StartValue->getType(), "cast.crd"); // Handle reverse integer induction counter. EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end"); break; } case LoopVectorizationLegality::IK_PtrInduction: { // For pointer induction variables, calculate the offset using // the end index. EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown, "ptr.ind.end"); break; } case LoopVectorizationLegality::IK_ReversePtrInduction: { // The value at the end of the loop for the reverse pointer is calculated // by creating a GEP with a negative index starting from the start value. Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown, "rev.ind.end"); EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx, "rev.ptr.ind.end"); break; } }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { if (OrigPhi == OldInduction) ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); else ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); } ResumeVal->addIncoming(EndValue, VecBody); // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); // The old induction's phi node in the scalar body needs the truncated // value. if (OrigPhi == OldInduction) { BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); } else { BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); } } // If we are generating a new induction variable then we also need to // generate the code that calculates the exit value. This value is not // simply the end of the counter because we may skip the vectorized body // in case of a runtime check. if (!OldInduction){ assert(!ResumeIndex && "Unexpected resume value found"); ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", MiddleBlock->getTerminator()); for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); } // Make sure that we found the index where scalar loop needs to continue. assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && "Invalid resume Index"); // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, ResumeIndex, "cmp.n", MiddleBlock->getTerminator()); BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); // Remove the old terminator. MiddleBlock->getTerminator()->eraseFromParent(); // Create i+1 and fill the PHINode. Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); Induction->addIncoming(StartIdx, VectorPH); Induction->addIncoming(NextIdx, VecBody); // Create the compare. Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); // Now we have two terminators. Remove the old one from the block. VecBody->getTerminator()->eraseFromParent(); // Get ready to start creating new instructions into the vectorized body. Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); // Save the state. LoopVectorPreHeader = VectorPH; LoopScalarPreHeader = ScalarPH; LoopMiddleBlock = MiddleBlock; LoopExitBlock = ExitBlock; LoopVectorBody.push_back(VecBody); LoopScalarBody = OldBasicBlock; LoopVectorizeHints Hints(Lp, true); Hints.setAlreadyVectorized(); } /// This function returns the identity element (or neutral element) for /// the operation K. Constant* LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { switch (K) { case RK_IntegerXor: case RK_IntegerAdd: case RK_IntegerOr: // Adding, Xoring, Oring zero to a number does not change it. return ConstantInt::get(Tp, 0); case RK_IntegerMult: // Multiplying a number by 1 does not change it. return ConstantInt::get(Tp, 1); case RK_IntegerAnd: // AND-ing a number with an all-1 value does not change it. return ConstantInt::get(Tp, -1, true); case RK_FloatMult: // Multiplying a number by 1 does not change it. return ConstantFP::get(Tp, 1.0L); case RK_FloatAdd: // Adding zero to a number does not change it. return ConstantFP::get(Tp, 0.0L); default: llvm_unreachable("Unknown reduction kind"); } } /// This function translates the reduction kind to an LLVM binary operator. static unsigned getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { switch (Kind) { case LoopVectorizationLegality::RK_IntegerAdd: return Instruction::Add; case LoopVectorizationLegality::RK_IntegerMult: return Instruction::Mul; case LoopVectorizationLegality::RK_IntegerOr: return Instruction::Or; case LoopVectorizationLegality::RK_IntegerAnd: return Instruction::And; case LoopVectorizationLegality::RK_IntegerXor: return Instruction::Xor; case LoopVectorizationLegality::RK_FloatMult: return Instruction::FMul; case LoopVectorizationLegality::RK_FloatAdd: return Instruction::FAdd; case LoopVectorizationLegality::RK_IntegerMinMax: return Instruction::ICmp; case LoopVectorizationLegality::RK_FloatMinMax: return Instruction::FCmp; default: llvm_unreachable("Unknown reduction operation"); } } Value *createMinMaxOp(IRBuilder<> &Builder, LoopVectorizationLegality::MinMaxReductionKind RK, Value *Left, Value *Right) { CmpInst::Predicate P = CmpInst::ICMP_NE; switch (RK) { default: llvm_unreachable("Unknown min/max reduction kind"); case LoopVectorizationLegality::MRK_UIntMin: P = CmpInst::ICMP_ULT; break; case LoopVectorizationLegality::MRK_UIntMax: P = CmpInst::ICMP_UGT; break; case LoopVectorizationLegality::MRK_SIntMin: P = CmpInst::ICMP_SLT; break; case LoopVectorizationLegality::MRK_SIntMax: P = CmpInst::ICMP_SGT; break; case LoopVectorizationLegality::MRK_FloatMin: P = CmpInst::FCMP_OLT; break; case LoopVectorizationLegality::MRK_FloatMax: P = CmpInst::FCMP_OGT; break; } Value *Cmp; if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == LoopVectorizationLegality::MRK_FloatMax) Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); else Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); return Select; } namespace { struct CSEDenseMapInfo { static bool canHandle(Instruction *I) { return isa(I) || isa(I) || isa(I) || isa(I); } static inline Instruction *getEmptyKey() { return DenseMapInfo::getEmptyKey(); } static inline Instruction *getTombstoneKey() { return DenseMapInfo::getTombstoneKey(); } static unsigned getHashValue(Instruction *I) { assert(canHandle(I) && "Unknown instruction!"); return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), I->value_op_end())); } static bool isEqual(Instruction *LHS, Instruction *RHS) { if (LHS == getEmptyKey() || RHS == getEmptyKey() || LHS == getTombstoneKey() || RHS == getTombstoneKey()) return LHS == RHS; return LHS->isIdenticalTo(RHS); } }; } /// \brief Check whether this block is a predicated block. /// Due to if predication of stores we might create a sequence of "if(pred) a[i] /// = ...; " blocks. We start with one vectorized basic block. For every /// conditional block we split this vectorized block. Therefore, every second /// block will be a predicated one. static bool isPredicatedBlock(unsigned BlockNum) { return BlockNum % 2; } ///\brief Perform cse of induction variable instructions. static void cse(SmallVector &BBs) { // Perform simple cse. SmallDenseMap CSEMap; for (unsigned i = 0, e = BBs.size(); i != e; ++i) { BasicBlock *BB = BBs[i]; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { Instruction *In = I++; if (!CSEDenseMapInfo::canHandle(In)) continue; // Check if we can replace this instruction with any of the // visited instructions. if (Instruction *V = CSEMap.lookup(In)) { In->replaceAllUsesWith(V); In->eraseFromParent(); continue; } // Ignore instructions in conditional blocks. We create "if (pred) a[i] = // ...;" blocks for predicated stores. Every second block is a predicated // block. if (isPredicatedBlock(i)) continue; CSEMap[In] = In; } } } /// \brief Adds a 'fast' flag to floating point operations. static Value *addFastMathFlag(Value *V) { if (isa(V)){ FastMathFlags Flags; Flags.setUnsafeAlgebra(); cast(V)->setFastMathFlags(Flags); } return V; } void InnerLoopVectorizer::vectorizeLoop() { //===------------------------------------------------===// // // Notice: any optimization or new instruction that go // into the code below should be also be implemented in // the cost-model. // //===------------------------------------------------===// Constant *Zero = Builder.getInt32(0); // In order to support reduction variables we need to be able to vectorize // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two // stages. First, we create a new vector PHI node with no incoming edges. // We use this value when we vectorize all of the instructions that use the // PHI. Next, after all of the instructions in the block are complete we // add the new incoming edges to the PHI. At this point all of the // instructions in the basic block are vectorized, so we can use them to // construct the PHI. PhiVector RdxPHIsToFix; // Scan the loop in a topological order to ensure that defs are vectorized // before users. LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); // Vectorize all of the blocks in the original loop. for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO(); bb != be; ++bb) vectorizeBlockInLoop(*bb, &RdxPHIsToFix); // At this point every instruction in the original loop is widened to // a vector form. We are almost done. Now, we need to fix the PHI nodes // that we vectorized. The PHI nodes are currently empty because we did // not want to introduce cycles. Notice that the remaining PHI nodes // that we need to fix are reduction variables. // Create the 'reduced' values for each of the induction vars. // The reduced values are the vector values that we scalarize and combine // after the loop is finished. for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); it != e; ++it) { PHINode *RdxPhi = *it; assert(RdxPhi && "Unable to recover vectorized PHI"); // Find the reduction variable descriptor. assert(Legal->getReductionVars()->count(RdxPhi) && "Unable to find the reduction variable"); LoopVectorizationLegality::ReductionDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; setDebugLocFromInst(Builder, RdxDesc.StartValue); // We need to generate a reduction vector from the incoming scalar. // To do so, we need to generate the 'identity' vector and override // one of the elements with the incoming scalar reduction. We need // to do it in the vector-loop preheader. Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); // This is the vector-clone of the value that leaves the loop. VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); Type *VecTy = VectorExit[0]->getType(); // Find the reduction identity variable. Zero for addition, or, xor, // one for multiplication, -1 for And. Value *Identity; Value *VectorStart; if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax || RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) { // MinMax reduction have the start value as their identify. if (VF == 1) { VectorStart = Identity = RdxDesc.StartValue; } else { VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue, "minmax.ident"); } } else { // Handle other reduction kinds: Constant *Iden = LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType()); if (VF == 1) { Identity = Iden; // This vector is the Identity vector where the first element is the // incoming scalar reduction. VectorStart = RdxDesc.StartValue; } else { Identity = ConstantVector::getSplat(VF, Iden); // This vector is the Identity vector where the first element is the // incoming scalar reduction. VectorStart = Builder.CreateInsertElement(Identity, RdxDesc.StartValue, Zero); } } // Fix the vector-loop phi. // Reductions do not have to start at zero. They can start with // any loop invariant values. VectorParts &VecRdxPhi = WidenMap.get(RdxPhi); BasicBlock *Latch = OrigLoop->getLoopLatch(); Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch); VectorParts &Val = getVectorValue(LoopVal); for (unsigned part = 0; part < UF; ++part) { // Make sure to add the reduction stat value only to the // first unroll part. Value *StartVal = (part == 0) ? VectorStart : Identity; cast(VecRdxPhi[part])->addIncoming(StartVal, LoopVectorPreHeader); cast(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody.back()); } // Before each round, move the insertion point right between // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement // instructions. Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); VectorParts RdxParts; setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr); for (unsigned part = 0; part < UF; ++part) { // This PHINode contains the vectorized reduction variable, or // the initial value vector, if we bypass the vector loop. VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); Value *StartVal = (part == 0) ? VectorStart : Identity; for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody.back()); RdxParts.push_back(NewPhi); } // Reduce all of the unrolled parts into a single vector. Value *ReducedPartRdx = RdxParts[0]; unsigned Op = getReductionBinOp(RdxDesc.Kind); setDebugLocFromInst(Builder, ReducedPartRdx); for (unsigned part = 1; part < UF; ++part) { if (Op != Instruction::ICmp && Op != Instruction::FCmp) // Floating point operations had to be 'fast' to enable the reduction. ReducedPartRdx = addFastMathFlag( Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], ReducedPartRdx, "bin.rdx")); else ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind, ReducedPartRdx, RdxParts[part]); } if (VF > 1) { // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles // and vector ops, reducing the set of values being computed by half each // round. assert(isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!"); Value *TmpVec = ReducedPartRdx; SmallVector ShuffleMask(VF, nullptr); for (unsigned i = VF; i != 1; i >>= 1) { // Move the upper half of the vector to the lower half. for (unsigned j = 0; j != i/2; ++j) ShuffleMask[j] = Builder.getInt32(i/2 + j); // Fill the rest of the mask with undef. std::fill(&ShuffleMask[i/2], ShuffleMask.end(), UndefValue::get(Builder.getInt32Ty())); Value *Shuf = Builder.CreateShuffleVector(TmpVec, UndefValue::get(TmpVec->getType()), ConstantVector::get(ShuffleMask), "rdx.shuf"); if (Op != Instruction::ICmp && Op != Instruction::FCmp) // Floating point operations had to be 'fast' to enable the reduction. TmpVec = addFastMathFlag(Builder.CreateBinOp( (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); else TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); } // The result is in the first element of the vector. ReducedPartRdx = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); } // Create a phi node that merges control-flow from the backedge-taken check // block and the middle block. PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", LoopScalarPreHeader->getTerminator()); BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]); BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. // We know that the loop is in LCSSA form. We need to update the // PHI nodes in the exit blocks. for (BasicBlock::iterator LEI = LoopExitBlock->begin(), LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { PHINode *LCSSAPhi = dyn_cast(LEI); if (!LCSSAPhi) break; // All PHINodes need to have a single entry edge, or two if // we already fixed them. assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); // We found our reduction value exit-PHI. Update it with the // incoming bypass edge. if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { // Add an edge coming from the bypass. LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); break; } }// end of the LCSSA phi scan. // Fix the scalar loop reduction variable with the incoming reduction sum // from the vector body and from the backedge value. int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); // Pick the other block. int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); }// end of for each redux variable. fixLCSSAPHIs(); // Remove redundant induction instructions. cse(LoopVectorBody); } void InnerLoopVectorizer::fixLCSSAPHIs() { for (BasicBlock::iterator LEI = LoopExitBlock->begin(), LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { PHINode *LCSSAPhi = dyn_cast(LEI); if (!LCSSAPhi) break; if (LCSSAPhi->getNumIncomingValues() == 1) LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), LoopMiddleBlock); } } InnerLoopVectorizer::VectorParts InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && "Invalid edge"); // Look for cached value. std::pair Edge(Src, Dst); EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge); if (ECEntryIt != MaskCache.end()) return ECEntryIt->second; VectorParts SrcMask = createBlockInMask(Src); // The terminator has to be a branch inst! BranchInst *BI = dyn_cast(Src->getTerminator()); assert(BI && "Unexpected terminator found"); if (BI->isConditional()) { VectorParts EdgeMask = getVectorValue(BI->getCondition()); if (BI->getSuccessor(0) != Dst) for (unsigned part = 0; part < UF; ++part) EdgeMask[part] = Builder.CreateNot(EdgeMask[part]); for (unsigned part = 0; part < UF; ++part) EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); MaskCache[Edge] = EdgeMask; return EdgeMask; } MaskCache[Edge] = SrcMask; return SrcMask; } InnerLoopVectorizer::VectorParts InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); // Loop incoming mask is all-one. if (OrigLoop->getHeader() == BB) { Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); return getVectorValue(C); } // This is the block mask. We OR all incoming edges, and with zero. Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); VectorParts BlockMask = getVectorValue(Zero); // For each pred: for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) { VectorParts EM = createEdgeMask(*it, BB); for (unsigned part = 0; part < UF; ++part) BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); } return BlockMask; } void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, unsigned VF, PhiVector *PV) { PHINode* P = cast(PN); // Handle reduction variables: if (Legal->getReductionVars()->count(P)) { for (unsigned part = 0; part < UF; ++part) { // This is phase one of vectorizing PHIs. Type *VecTy = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", LoopVectorBody.back()-> getFirstInsertionPt()); } PV->push_back(P); return; } setDebugLocFromInst(Builder, P); // Check for PHI nodes that are lowered to vector selects. if (P->getParent() != OrigLoop->getHeader()) { // We know that all PHIs in non-header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. // At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. unsigned NumIncoming = P->getNumIncomingValues(); // Generate a sequence of selects of the form: // SELECT(Mask3, In3, // SELECT(Mask2, In2, // ( ...))) for (unsigned In = 0; In < NumIncoming; In++) { VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), P->getParent()); VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); for (unsigned part = 0; part < UF; ++part) { // We might have single edge PHIs (blocks) - use an identity // 'select' for the first PHI operand. if (In == 0) Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In0[part]); else // Select between the current value and the previous incoming edge // based on the incoming mask. Entry[part] = Builder.CreateSelect(Cond[part], In0[part], Entry[part], "predphi"); } } return; } // This PHINode must be an induction variable. // Make sure that we know about it. assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); LoopVectorizationLegality::InductionInfo II = Legal->getInductionVars()->lookup(P); switch (II.IK) { case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); case LoopVectorizationLegality::IK_IntInduction: { assert(P->getType() == II.StartValue->getType() && "Types must match"); Type *PhiTy = P->getType(); Value *Broadcasted; if (P == OldInduction) { // Handle the canonical induction variable. We might have had to // extend the type. Broadcasted = Builder.CreateTrunc(Induction, PhiTy); } else { // Handle other induction variables that are now based on the // canonical one. Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, "offset.idx"); } Broadcasted = getBroadcastInstrs(Broadcasted); // After broadcasting the induction variable we need to make the vector // consecutive by adding 0, 1, 2, etc. for (unsigned part = 0; part < UF; ++part) Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); return; } case LoopVectorizationLegality::IK_ReverseIntInduction: case LoopVectorizationLegality::IK_PtrInduction: case LoopVectorizationLegality::IK_ReversePtrInduction: // Handle reverse integer and pointer inductions. Value *StartIdx = ExtendedIdx; // This is the normalized GEP that starts counting at zero. Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, "normalized.idx"); // Handle the reverse integer induction variable case. if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { IntegerType *DstTy = cast(II.StartValue->getType()); Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, "resize.norm.idx"); Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, "reverse.idx"); // This is a new value so do not hoist it out. Value *Broadcasted = getBroadcastInstrs(ReverseInd); // After broadcasting the induction variable we need to make the // vector consecutive by adding ... -3, -2, -1, 0. for (unsigned part = 0; part < UF; ++part) Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, true); return; } // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); // Is this a reverse induction ptr or a consecutive induction ptr. bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == II.IK); // This is the vector of results. Notice that we don't generate // vector geps because scalar geps result in better code. for (unsigned part = 0; part < UF; ++part) { if (VF == 1) { int EltIndex = (part) * (Reverse ? -1 : 1); Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); Value *GlobalIdx; if (Reverse) GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); else GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, "next.gep"); Entry[part] = SclrGep; continue; } Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); for (unsigned int i = 0; i < VF; ++i) { int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); Value *GlobalIdx; if (!Reverse) GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); else GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, "next.gep"); VecVal = Builder.CreateInsertElement(VecVal, SclrGep, Builder.getInt32(i), "insert.gep"); } Entry[part] = VecVal; } return; } } void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { VectorParts &Entry = WidenMap.get(it); switch (it->getOpcode()) { case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the // loop control flow instructions. continue; case Instruction::PHI:{ // Vectorize PHINodes. widenPHIInstruction(it, Entry, UF, VF, PV); continue; }// End of PHI. case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::FDiv: case Instruction::URem: case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: case Instruction::And: case Instruction::Or: case Instruction::Xor: { // Just widen binops. BinaryOperator *BinOp = dyn_cast(it); setDebugLocFromInst(Builder, BinOp); VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); // Use this vector value for all users of the original instruction. for (unsigned Part = 0; Part < UF; ++Part) { Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); if (BinaryOperator *VecOp = dyn_cast(V)) VecOp->copyIRFlags(BinOp); Entry[Part] = V; } propagateMetadata(Entry, it); break; } case Instruction::Select: { // Widen selects. // If the selector is loop invariant we can create a select // instruction with a scalar condition. Otherwise, use vector-select. bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), OrigLoop); setDebugLocFromInst(Builder, it); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. VectorParts &Cond = getVectorValue(it->getOperand(0)); VectorParts &Op0 = getVectorValue(it->getOperand(1)); VectorParts &Op1 = getVectorValue(it->getOperand(2)); Value *ScalarCond = (VF == 1) ? Cond[0] : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part] = Builder.CreateSelect( InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]); } propagateMetadata(Entry, it); break; } case Instruction::ICmp: case Instruction::FCmp: { // Widen compares. Generate vector compares. bool FCmp = (it->getOpcode() == Instruction::FCmp); CmpInst *Cmp = dyn_cast(it); setDebugLocFromInst(Builder, it); VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); for (unsigned Part = 0; Part < UF; ++Part) { Value *C = nullptr; if (FCmp) C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); else C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); Entry[Part] = C; } propagateMetadata(Entry, it); break; } case Instruction::Store: case Instruction::Load: vectorizeMemoryInstruction(it); break; case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::FPExt: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::SIToFP: case Instruction::UIToFP: case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { CastInst *CI = dyn_cast(it); setDebugLocFromInst(Builder, it); /// Optimize the special case where the source is the induction /// variable. Notice that we can only optimize the 'trunc' case /// because: a. FP conversions lose precision, b. sext/zext may wrap, /// c. other casts depend on pointer size. if (CI->getOperand(0) == OldInduction && it->getOpcode() == Instruction::Trunc) { Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); Value *Broadcasted = getBroadcastInstrs(ScalarCast); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); propagateMetadata(Entry, it); break; } /// Vectorize casts. Type *DestTy = (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); VectorParts &A = getVectorValue(it->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); propagateMetadata(Entry, it); break; } case Instruction::Call: { // Ignore dbg intrinsics. if (isa(it)) break; setDebugLocFromInst(Builder, it); Module *M = BB->getParent()->getParent(); CallInst *CI = cast(it); Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); assert(ID && "Not an intrinsic call!"); switch (ID) { case Intrinsic::assume: case Intrinsic::lifetime_end: case Intrinsic::lifetime_start: scalarizeInstruction(it); break; default: bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector Args; for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { if (HasScalarOpd && i == 1) { Args.push_back(CI->getArgOperand(i)); continue; } VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); Args.push_back(Arg[Part]); } Type *Tys[] = {CI->getType()}; if (VF > 1) Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF); Function *F = Intrinsic::getDeclaration(M, ID, Tys); Entry[Part] = Builder.CreateCall(F, Args); } propagateMetadata(Entry, it); break; } break; } default: // All other instructions are unsupported. Scalarize them. scalarizeInstruction(it); break; }// end of switch. }// end of for_each instr. } void InnerLoopVectorizer::updateAnalysis() { // Forget the original basic block. SE->forgetLoop(OrigLoop); // Update the dominator tree information. assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && "Entry does not dominate exit."); for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); // Due to if predication of stores we might create a sequence of "if(pred) // a[i] = ...; " blocks. for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) { if (i == 0) DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); else if (isPredicatedBlock(i)) { DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]); } else { DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]); } } DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); DEBUG(DT->verifyDomTree()); } /// \brief Check whether it is safe to if-convert this phi node. /// /// Phi nodes with constant expressions that can trap are not safe to if /// convert. static bool canIfConvertPHINodes(BasicBlock *BB) { for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { PHINode *Phi = dyn_cast(I); if (!Phi) return true; for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p) if (Constant *C = dyn_cast(Phi->getIncomingValue(p))) if (C->canTrap()) return false; } return true; } bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) { emitAnalysis(Report() << "if-conversion is disabled"); return false; } assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); // A list of pointers that we can safely read and write to. SmallPtrSet SafePointes; // Collect safe addresses. for (Loop::block_iterator BI = TheLoop->block_begin(), BE = TheLoop->block_end(); BI != BE; ++BI) { BasicBlock *BB = *BI; if (blockNeedsPredication(BB)) continue; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { if (LoadInst *LI = dyn_cast(I)) SafePointes.insert(LI->getPointerOperand()); else if (StoreInst *SI = dyn_cast(I)) SafePointes.insert(SI->getPointerOperand()); } } // Collect the blocks that need predication. BasicBlock *Header = TheLoop->getHeader(); for (Loop::block_iterator BI = TheLoop->block_begin(), BE = TheLoop->block_end(); BI != BE; ++BI) { BasicBlock *BB = *BI; // We don't support switch statements inside loops. if (!isa(BB->getTerminator())) { emitAnalysis(Report(BB->getTerminator()) << "loop contains a switch statement"); return false; } // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB)) { if (!blockCanBePredicated(BB, SafePointes)) { emitAnalysis(Report(BB->getTerminator()) << "control flow cannot be substituted for a select"); return false; } } else if (BB != Header && !canIfConvertPHINodes(BB)) { emitAnalysis(Report(BB->getTerminator()) << "control flow cannot be substituted for a select"); return false; } } // We can if-convert this loop. return true; } bool LoopVectorizationLegality::canVectorize() { // We must have a loop in canonical form. Loops with indirectbr in them cannot // be canonicalized. if (!TheLoop->getLoopPreheader()) { emitAnalysis( Report() << "loop control flow is not understood by vectorizer"); return false; } // We can only vectorize innermost loops. if (TheLoop->getSubLoopsVector().size()) { emitAnalysis(Report() << "loop is not the innermost loop"); return false; } // We must have a single backedge. if (TheLoop->getNumBackEdges() != 1) { emitAnalysis( Report() << "loop control flow is not understood by vectorizer"); return false; } // We must have a single exiting block. if (!TheLoop->getExitingBlock()) { emitAnalysis( Report() << "loop control flow is not understood by vectorizer"); return false; } // We only handle bottom-tested loops, i.e. loop in which the condition is // checked at the end of each iteration. With that we can assume that all // instructions in the loop are executed the same number of times. if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { emitAnalysis( Report() << "loop control flow is not understood by vectorizer"); return false; } // We need to have a loop header. DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() << '\n'); // Check if we can if-convert non-single-bb loops. unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); return false; } // ScalarEvolution needs to be able to find the exit count. const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); if (ExitCount == SE->getCouldNotCompute()) { emitAnalysis(Report() << "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } // Check if we can vectorize the instructions and CFG in this loop. if (!canVectorizeInstrs()) { DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); return false; } // Go over each instruction and look at memory deps. if (!canVectorizeMemory()) { DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); return false; } // Collect all of the variables that remain uniform after vectorization. collectLoopUniforms(); DEBUG(dbgs() << "LV: We can vectorize this loop" << (PtrRtCheck.Need ? " (with a runtime bound check)" : "") <<"!\n"); // Okay! We can vectorize. At this point we don't have any other mem analysis // which may limit our maximum vectorization factor, so just return true with // no restrictions. return true; } static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) return DL.getIntPtrType(Ty); // It is possible that char's or short's overflow when we ask for the loop's // trip count, work around this by changing the type size. if (Ty->getScalarSizeInBits() < 32) return Type::getInt32Ty(Ty->getContext()); return Ty; } static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { Ty0 = convertPointerToIntegerType(DL, Ty0); Ty1 = convertPointerToIntegerType(DL, Ty1); if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) return Ty0; return Ty1; } /// \brief Check that the instruction has outside loop users and is not an /// identified reduction variable. static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, SmallPtrSetImpl &Reductions) { // Reduction instructions are allowed to have exit users. All other // instructions must not have external users. if (!Reductions.count(Inst)) //Check that all of the users of the loop are inside the BB. for (User *U : Inst->users()) { Instruction *UI = cast(U); // This user may be a reduction exit value. if (!TheLoop->contains(UI)) { DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n'); return true; } } return false; } bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); // Look for the attribute signaling the absence of NaNs. Function &F = *Header->getParent(); if (F.hasFnAttribute("no-nans-fp-math")) HasFunNoNaNAttr = F.getAttributes().getAttribute( AttributeSet::FunctionIndex, "no-nans-fp-math").getValueAsString() == "true"; // For each block in the loop. for (Loop::block_iterator bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; ++bb) { // Scan the instructions in the block and look for hazards. for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; ++it) { if (PHINode *Phi = dyn_cast(it)) { Type *PhiTy = Phi->getType(); // Check that this PHI type is allowed. if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { emitAnalysis(Report(it) << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; } // If this PHINode is not in the header block, then we know that we // can convert it to select during if-conversion. No need to check if // the PHIs in this block are induction or reduction variables. if (*bb != Header) { // Check that this instruction has no outside users or is an // identified reduction value with an outside user. if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) continue; emitAnalysis(Report(it) << "value could not be identified as " "an induction or reduction variable"); return false; } // We only allow if-converted PHIs with exactly two incoming values. if (Phi->getNumIncomingValues() != 2) { emitAnalysis(Report(it) << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } // This is the value coming from the preheader. Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); // Check if this is an induction variable. InductionKind IK = isInductionVariable(Phi); if (IK_NoInduction != IK) { // Get the widest type. if (!WidestIndTy) WidestIndTy = convertPointerToIntegerType(*DL, PhiTy); else WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy); // Int inductions are special because we only allow one IV. if (IK == IK_IntInduction) { // Use the phi node with the widest type as induction. Use the last // one if there are multiple (no good reason for doing this other // than it is expedient). if (!Induction || PhiTy == WidestIndTy) Induction = Phi; } DEBUG(dbgs() << "LV: Found an induction variable.\n"); Inductions[Phi] = InductionInfo(StartValue, IK); // Until we explicitly handle the case of an induction variable with // an outside loop user we have to give up vectorizing this loop. if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { emitAnalysis(Report(it) << "use of induction value outside of the " "loop is not handled by vectorizer"); return false; } continue; } if (AddReductionVar(Phi, RK_IntegerAdd)) { DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); continue; } if (AddReductionVar(Phi, RK_IntegerMult)) { DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); continue; } if (AddReductionVar(Phi, RK_IntegerOr)) { DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); continue; } if (AddReductionVar(Phi, RK_IntegerAnd)) { DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); continue; } if (AddReductionVar(Phi, RK_IntegerXor)) { DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); continue; } if (AddReductionVar(Phi, RK_IntegerMinMax)) { DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n"); continue; } if (AddReductionVar(Phi, RK_FloatMult)) { DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n"); continue; } if (AddReductionVar(Phi, RK_FloatAdd)) { DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n"); continue; } if (AddReductionVar(Phi, RK_FloatMinMax)) { DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi << "\n"); continue; } emitAnalysis(Report(it) << "value that could not be identified as " "reduction is used outside the loop"); DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); return false; }// end of PHI handling // We still don't handle functions. However, we can ignore dbg intrinsic // calls and we do handle certain intrinsic and libm functions. CallInst *CI = dyn_cast(it); if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa(CI)) { emitAnalysis(Report(it) << "call instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found a call site.\n"); return false; } // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the // second argument is the same (i.e. loop invariant) if (CI && hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { emitAnalysis(Report(it) << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); return false; } } // Check that the instruction return type is vectorizable. // Also, we can't vectorize extractelement instructions. if ((!VectorType::isValidElementType(it->getType()) && !it->getType()->isVoidTy()) || isa(it)) { emitAnalysis(Report(it) << "instruction return type cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; } // Check that the stored type is vectorizable. if (StoreInst *ST = dyn_cast(it)) { Type *T = ST->getValueOperand()->getType(); if (!VectorType::isValidElementType(T)) { emitAnalysis(Report(ST) << "store instruction cannot be vectorized"); return false; } if (EnableMemAccessVersioning) collectStridedAccess(ST); } if (EnableMemAccessVersioning) if (LoadInst *LI = dyn_cast(it)) collectStridedAccess(LI); // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { emitAnalysis(Report(it) << "value cannot be used outside the loop"); return false; } } // next instr. } if (!Induction) { DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); if (Inductions.empty()) { emitAnalysis(Report() << "loop induction variable could not be identified"); return false; } } return true; } ///\brief Remove GEPs whose indices but the last one are loop invariant and /// return the induction operand of the gep pointer. static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, const DataLayout *DL, Loop *Lp) { GetElementPtrInst *GEP = dyn_cast(Ptr); if (!GEP) return Ptr; unsigned InductionOperand = getGEPInductionOperand(DL, GEP); // Check that all of the gep indices are uniform except for our induction // operand. for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) if (i != InductionOperand && !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp)) return Ptr; return GEP->getOperand(InductionOperand); } ///\brief Look for a cast use of the passed value. static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) { Value *UniqueCast = nullptr; for (User *U : Ptr->users()) { CastInst *CI = dyn_cast(U); if (CI && CI->getType() == Ty) { if (!UniqueCast) UniqueCast = CI; else return nullptr; } } return UniqueCast; } ///\brief Get the stride of a pointer access in a loop. /// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a /// pointer to the Value, or null otherwise. static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, const DataLayout *DL, Loop *Lp) { const PointerType *PtrTy = dyn_cast(Ptr->getType()); if (!PtrTy || PtrTy->isAggregateType()) return nullptr; // Try to remove a gep instruction to make the pointer (actually index at this // point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the // pointer, otherwise, we are analyzing the index. Value *OrigPtr = Ptr; // The size of the pointer access. int64_t PtrAccessSize = 1; Ptr = stripGetElementPtr(Ptr, SE, DL, Lp); const SCEV *V = SE->getSCEV(Ptr); if (Ptr != OrigPtr) // Strip off casts. while (const SCEVCastExpr *C = dyn_cast(V)) V = C->getOperand(); const SCEVAddRecExpr *S = dyn_cast(V); if (!S) return nullptr; V = S->getStepRecurrence(*SE); if (!V) return nullptr; // Strip off the size of access multiplication if we are still analyzing the // pointer. if (OrigPtr == Ptr) { DL->getTypeAllocSize(PtrTy->getElementType()); if (const SCEVMulExpr *M = dyn_cast(V)) { if (M->getOperand(0)->getSCEVType() != scConstant) return nullptr; const APInt &APStepVal = cast(M->getOperand(0))->getValue()->getValue(); // Huge step value - give up. if (APStepVal.getBitWidth() > 64) return nullptr; int64_t StepVal = APStepVal.getSExtValue(); if (PtrAccessSize != StepVal) return nullptr; V = M->getOperand(1); } } // Strip off casts. Type *StripedOffRecurrenceCast = nullptr; if (const SCEVCastExpr *C = dyn_cast(V)) { StripedOffRecurrenceCast = C->getType(); V = C->getOperand(); } // Look for the loop invariant symbolic value. const SCEVUnknown *U = dyn_cast(V); if (!U) return nullptr; Value *Stride = U->getValue(); if (!Lp->isLoopInvariant(Stride)) return nullptr; // If we have stripped off the recurrence cast we have to make sure that we // return the value that is used in this loop so that we can replace it later. if (StripedOffRecurrenceCast) Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast); return Stride; } void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { Value *Ptr = nullptr; if (LoadInst *LI = dyn_cast(MemAccess)) Ptr = LI->getPointerOperand(); else if (StoreInst *SI = dyn_cast(MemAccess)) Ptr = SI->getPointerOperand(); else return; Value *Stride = getStrideFromPointer(Ptr, SE, DL, TheLoop); if (!Stride) return; DEBUG(dbgs() << "LV: Found a strided access that we can version"); DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n"); Strides[Ptr] = Stride; StrideSet.insert(Stride); } void LoopVectorizationLegality::collectLoopUniforms() { // We now know that the loop is vectorizable! // Collect variables that will remain uniform after vectorization. std::vector Worklist; BasicBlock *Latch = TheLoop->getLoopLatch(); // Start with the conditional branch and walk up the block. Worklist.push_back(Latch->getTerminator()->getOperand(0)); // Also add all consecutive pointer values; these values will be uniform // after vectorization (and subsequent cleanup) and, until revectorization is // supported, all dependencies must also be uniform. for (Loop::block_iterator B = TheLoop->block_begin(), BE = TheLoop->block_end(); B != BE; ++B) for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); I != IE; ++I) if (I->getType()->isPointerTy() && isConsecutivePtr(I)) Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); while (Worklist.size()) { Instruction *I = dyn_cast(Worklist.back()); Worklist.pop_back(); // Look at instructions inside this loop. // Stop when reaching PHI nodes. // TODO: we need to follow values all over the loop, not only in this block. if (!I || !TheLoop->contains(I) || isa(I)) continue; // This is a known uniform. Uniforms.insert(I); // Insert all operands. Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); } } namespace { /// \brief Analyses memory accesses in a loop. /// /// Checks whether run time pointer checks are needed and builds sets for data /// dependence checking. class AccessAnalysis { public: /// \brief Read or write access location. typedef PointerIntPair MemAccessInfo; typedef SmallPtrSet MemAccessInfoSet; /// \brief Set of potential dependent memory accesses. typedef EquivalenceClasses DepCandidates; AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) : DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {} /// \brief Register a load and whether it is only read from. void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) { Value *Ptr = const_cast(Loc.Ptr); AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); Accesses.insert(MemAccessInfo(Ptr, false)); if (IsReadOnly) ReadOnlyPtr.insert(Ptr); } /// \brief Register a store. void addStore(AliasAnalysis::Location &Loc) { Value *Ptr = const_cast(Loc.Ptr); AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); Accesses.insert(MemAccessInfo(Ptr, true)); } /// \brief Check whether we can check the pointers at runtime for /// non-intersection. bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck, unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop, ValueToValueMap &Strides, bool ShouldCheckStride = false); /// \brief Goes over all memory accesses, checks whether a RT check is needed /// and builds sets of dependent accesses. void buildDependenceSets() { processMemAccesses(); } bool isRTCheckNeeded() { return IsRTCheckNeeded; } bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } void resetDepChecks() { CheckDeps.clear(); } MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } private: typedef SetVector PtrAccessSet; /// \brief Go over all memory access and check whether runtime pointer checks /// are needed /// and build sets of dependency check candidates. void processMemAccesses(); /// Set of all accesses. PtrAccessSet Accesses; /// Set of accesses that need a further dependence check. MemAccessInfoSet CheckDeps; /// Set of pointers that are read only. SmallPtrSet ReadOnlyPtr; const DataLayout *DL; /// An alias set tracker to partition the access set by underlying object and //intrinsic property (such as TBAA metadata). AliasSetTracker AST; /// Sets of potentially dependent accesses - members of one set share an /// underlying pointer. The set "CheckDeps" identfies which sets really need a /// dependence check. DepCandidates &DepCands; bool IsRTCheckNeeded; }; } // end anonymous namespace /// \brief Check whether a pointer can participate in a runtime bounds check. static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides, Value *Ptr) { const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); const SCEVAddRecExpr *AR = dyn_cast(PtrScev); if (!AR) return false; return AR->isAffine(); } /// \brief Check the stride of the pointer and ensure that it does not wrap in /// the address space. static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, const Loop *Lp, ValueToValueMap &StridesMap); bool AccessAnalysis::canCheckPtrAtRT( LoopVectorizationLegality::RuntimePointerCheck &RtCheck, unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop, ValueToValueMap &StridesMap, bool ShouldCheckStride) { // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. bool CanDoRT = true; bool IsDepCheckNeeded = isDependencyCheckNeeded(); NumComparisons = 0; // We assign a consecutive id to access from different alias sets. // Accesses between different groups doesn't need to be checked. unsigned ASId = 1; for (auto &AS : AST) { unsigned NumReadPtrChecks = 0; unsigned NumWritePtrChecks = 0; // We assign consecutive id to access from different dependence sets. // Accesses within the same set don't need a runtime check. unsigned RunningDepId = 1; DenseMap DepSetId; for (auto A : AS) { Value *Ptr = A.getValue(); bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true)); MemAccessInfo Access(Ptr, IsWrite); if (IsWrite) ++NumWritePtrChecks; else ++NumReadPtrChecks; if (hasComputableBounds(SE, StridesMap, Ptr) && // When we run after a failing dependency check we have to make sure we // don't have wrapping pointers. (!ShouldCheckStride || isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) { // The id of the dependence set. unsigned DepId; if (IsDepCheckNeeded) { Value *Leader = DepCands.getLeaderValue(Access).getPointer(); unsigned &LeaderId = DepSetId[Leader]; if (!LeaderId) LeaderId = RunningDepId++; DepId = LeaderId; } else // Each access has its own dependence set. DepId = RunningDepId++; RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap); DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n'); } else { CanDoRT = false; } } if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) NumComparisons += 0; // Only one dependence set. else { NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks + NumWritePtrChecks - 1)); } ++ASId; } // If the pointers that we would use for the bounds comparison have different // address spaces, assume the values aren't directly comparable, so we can't // use them for the runtime check. We also have to assume they could // overlap. In the future there should be metadata for whether address spaces // are disjoint. unsigned NumPointers = RtCheck.Pointers.size(); for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i + 1; j < NumPointers; ++j) { // Only need to check pointers between two different dependency sets. if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j]) continue; // Only need to check pointers in the same alias set. if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j]) continue; Value *PtrI = RtCheck.Pointers[i]; Value *PtrJ = RtCheck.Pointers[j]; unsigned ASi = PtrI->getType()->getPointerAddressSpace(); unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); if (ASi != ASj) { DEBUG(dbgs() << "LV: Runtime check would require comparison between" " different address spaces\n"); return false; } } } return CanDoRT; } void AccessAnalysis::processMemAccesses() { // We process the set twice: first we process read-write pointers, last we // process read-only pointers. This allows us to skip dependence tests for // read-only pointers. DEBUG(dbgs() << "LV: Processing memory accesses...\n"); DEBUG(dbgs() << " AST: "; AST.dump()); DEBUG(dbgs() << "LV: Accesses:\n"); DEBUG({ for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }); // The AliasSetTracker has nicely partitioned our pointers by metadata // compatibility and potential for underlying-object overlap. As a result, we // only need to check for potential pointer dependencies within each alias // set. for (auto &AS : AST) { // Note that both the alias-set tracker and the alias sets themselves used // linked lists internally and so the iteration order here is deterministic // (matching the original instruction order within each set). bool SetHasWrite = false; // Map of pointers to last access encountered. typedef DenseMap UnderlyingObjToAccessMap; UnderlyingObjToAccessMap ObjToLastAccess; // Set of access to check after all writes have been processed. PtrAccessSet DeferredAccesses; // Iterate over each alias set twice, once to process read/write pointers, // and then to process read-only pointers. for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { bool UseDeferred = SetIteration > 0; PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses; for (auto AV : AS) { Value *Ptr = AV.getValue(); // For a single memory access in AliasSetTracker, Accesses may contain // both read and write, and they both need to be handled for CheckDeps. for (auto AC : S) { if (AC.getPointer() != Ptr) continue; bool IsWrite = AC.getInt(); // If we're using the deferred access set, then it contains only // reads. bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; if (UseDeferred && !IsReadOnlyPtr) continue; // Otherwise, the pointer must be in the PtrAccessSet, either as a // read or a write. assert(((IsReadOnlyPtr && UseDeferred) || IsWrite || S.count(MemAccessInfo(Ptr, false))) && "Alias-set pointer not in the access set?"); MemAccessInfo Access(Ptr, IsWrite); DepCands.insert(Access); // Memorize read-only pointers for later processing and skip them in // the first round (they need to be checked after we have seen all // write pointers). Note: we also mark pointer that are not // consecutive as "read-only" pointers (so that we check // "a[b[i]] +="). Hence, we need the second check for "!IsWrite". if (!UseDeferred && IsReadOnlyPtr) { DeferredAccesses.insert(Access); continue; } // If this is a write - check other reads and writes for conflicts. If // this is a read only check other writes for conflicts (but only if // there is no other write to the ptr - this is an optimization to // catch "a[i] = a[i] + " without having to do a dependence check). if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { CheckDeps.insert(Access); IsRTCheckNeeded = true; } if (IsWrite) SetHasWrite = true; // Create sets of pointers connected by a shared alias set and // underlying object. typedef SmallVector ValueVector; ValueVector TempObjects; GetUnderlyingObjects(Ptr, TempObjects, DL); for (Value *UnderlyingObj : TempObjects) { UnderlyingObjToAccessMap::iterator Prev = ObjToLastAccess.find(UnderlyingObj); if (Prev != ObjToLastAccess.end()) DepCands.unionSets(Access, Prev->second); ObjToLastAccess[UnderlyingObj] = Access; } } } } } } namespace { /// \brief Checks memory dependences among accesses to the same underlying /// object to determine whether there vectorization is legal or not (and at /// which vectorization factor). /// /// This class works under the assumption that we already checked that memory /// locations with different underlying pointers are "must-not alias". /// We use the ScalarEvolution framework to symbolically evalutate access /// functions pairs. Since we currently don't restructure the loop we can rely /// on the program order of memory accesses to determine their safety. /// At the moment we will only deem accesses as safe for: /// * A negative constant distance assuming program order. /// /// Safe: tmp = a[i + 1]; OR a[i + 1] = x; /// a[i] = tmp; y = a[i]; /// /// The latter case is safe because later checks guarantuee that there can't /// be a cycle through a phi node (that is, we check that "x" and "y" is not /// the same variable: a header phi can only be an induction or a reduction, a /// reduction can't have a memory sink, an induction can't have a memory /// source). This is important and must not be violated (or we have to /// resort to checking for cycles through memory). /// /// * A positive constant distance assuming program order that is bigger /// than the biggest memory access. /// /// tmp = a[i] OR b[i] = x /// a[i+2] = tmp y = b[i+2]; /// /// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively. /// /// * Zero distances and all accesses have the same size. /// class MemoryDepChecker { public: typedef PointerIntPair MemAccessInfo; typedef SmallPtrSet MemAccessInfoSet; MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L) : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), ShouldRetryWithRuntimeCheck(false) {} /// \brief Register the location (instructions are given increasing numbers) /// of a write access. void addAccess(StoreInst *SI) { Value *Ptr = SI->getPointerOperand(); Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx); InstMap.push_back(SI); ++AccessIdx; } /// \brief Register the location (instructions are given increasing numbers) /// of a write access. void addAccess(LoadInst *LI) { Value *Ptr = LI->getPointerOperand(); Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx); InstMap.push_back(LI); ++AccessIdx; } /// \brief Check whether the dependencies between the accesses are safe. /// /// Only checks sets with elements in \p CheckDeps. bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides); /// \brief The maximum number of bytes of a vector register we can vectorize /// the accesses safely with. unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } /// \brief In same cases when the dependency check fails we can still /// vectorize the loop with a dynamic array access check. bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } private: ScalarEvolution *SE; const DataLayout *DL; const Loop *InnermostLoop; /// \brief Maps access locations (ptr, read/write) to program order. DenseMap > Accesses; /// \brief Memory access instructions in program order. SmallVector InstMap; /// \brief The program order index to be used for the next instruction. unsigned AccessIdx; // We can access this many bytes in parallel safely. unsigned MaxSafeDepDistBytes; /// \brief If we see a non-constant dependence distance we can still try to /// vectorize this loop with runtime checks. bool ShouldRetryWithRuntimeCheck; /// \brief Check whether there is a plausible dependence between the two /// accesses. /// /// Access \p A must happen before \p B in program order. The two indices /// identify the index into the program order map. /// /// This function checks whether there is a plausible dependence (or the /// absence of such can't be proved) between the two accesses. If there is a /// plausible dependence but the dependence distance is bigger than one /// element access it records this distance in \p MaxSafeDepDistBytes (if this /// distance is smaller than any other distance encountered so far). /// Otherwise, this function returns true signaling a possible dependence. bool isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B, unsigned BIdx, ValueToValueMap &Strides); /// \brief Check whether the data dependence could prevent store-load /// forwarding. bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize); }; } // end anonymous namespace static bool isInBoundsGep(Value *Ptr) { if (GetElementPtrInst *GEP = dyn_cast(Ptr)) return GEP->isInBounds(); return false; } /// \brief Check whether the access through \p Ptr has a constant stride. static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, const Loop *Lp, ValueToValueMap &StridesMap) { const Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Unexpected non-ptr"); // Make sure that the pointer does not point to aggregate types. const PointerType *PtrTy = cast(Ty); if (PtrTy->getElementType()->isAggregateType()) { DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr << "\n"); return 0; } const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr); const SCEVAddRecExpr *AR = dyn_cast(PtrScev); if (!AR) { DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " << *Ptr << " SCEV: " << *PtrScev << "\n"); return 0; } // The accesss function must stride over the innermost loop. if (Lp != AR->getLoop()) { DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n"); } // The address calculation must not wrap. Otherwise, a dependence could be // inverted. // An inbounds getelementptr that is a AddRec with a unit stride // cannot wrap per definition. The unit stride requirement is checked later. // An getelementptr without an inbounds attribute and unit stride would have // to access the pointer value "0" which is undefined behavior in address // space 0, therefore we can also vectorize this case. bool IsInBoundsGEP = isInBoundsGep(Ptr); bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) { DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space " << *Ptr << " SCEV: " << *PtrScev << "\n"); return 0; } // Check the step is constant. const SCEV *Step = AR->getStepRecurrence(*SE); // Calculate the pointer stride and check if it is consecutive. const SCEVConstant *C = dyn_cast(Step); if (!C) { DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr << " SCEV: " << *PtrScev << "\n"); return 0; } int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType()); const APInt &APStepVal = C->getValue()->getValue(); // Huge step value - give up. if (APStepVal.getBitWidth() > 64) return 0; int64_t StepVal = APStepVal.getSExtValue(); // Strided access. int64_t Stride = StepVal / Size; int64_t Rem = StepVal % Size; if (Rem) return 0; // If the SCEV could wrap but we have an inbounds gep with a unit stride we // know we can't "wrap around the address space". In case of address space // zero we know that this won't happen without triggering undefined behavior. if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && Stride != 1 && Stride != -1) return 0; return Stride; } bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize) { // If loads occur at a distance that is not a multiple of a feasible vector // factor store-load forwarding does not take place. // Positive dependences might cause troubles because vectorizing them might // prevent store-load forwarding making vectorized code run a lot slower. // a[i] = a[i-3] ^ a[i-8]; // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and // hence on your typical architecture store-load forwarding does not take // place. Vectorizing in such cases does not make sense. // Store-load forwarding distance. const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize; // Maximum vector factor. unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize; if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues) MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes; for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues; vf *= 2) { if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) { MaxVFWithoutSLForwardIssues = (vf >>=1); break; } } if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) { DEBUG(dbgs() << "LV: Distance " << Distance << " that could cause a store-load forwarding conflict\n"); return true; } if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes && MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize) MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues; return false; } bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B, unsigned BIdx, ValueToValueMap &Strides) { assert (AIdx < BIdx && "Must pass arguments in program order"); Value *APtr = A.getPointer(); Value *BPtr = B.getPointer(); bool AIsWrite = A.getInt(); bool BIsWrite = B.getInt(); // Two reads are independent. if (!AIsWrite && !BIsWrite) return false; // We cannot check pointers in different address spaces. if (APtr->getType()->getPointerAddressSpace() != BPtr->getType()->getPointerAddressSpace()) return true; const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr); const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr); int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides); int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides); const SCEV *Src = AScev; const SCEV *Sink = BScev; // If the induction step is negative we have to invert source and sink of the // dependence. if (StrideAPtr < 0) { //Src = BScev; //Sink = AScev; std::swap(APtr, BPtr); std::swap(Src, Sink); std::swap(AIsWrite, BIsWrite); std::swap(AIdx, BIdx); std::swap(StrideAPtr, StrideBPtr); } const SCEV *Dist = SE->getMinusSCEV(Sink, Src); DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink << "(Induction step: " << StrideAPtr << ")\n"); DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to " << *InstMap[BIdx] << ": " << *Dist << "\n"); // Need consecutive accesses. We don't want to vectorize // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in // the address space. if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){ DEBUG(dbgs() << "Non-consecutive pointer access\n"); return true; } const SCEVConstant *C = dyn_cast(Dist); if (!C) { DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n"); ShouldRetryWithRuntimeCheck = true; return true; } Type *ATy = APtr->getType()->getPointerElementType(); Type *BTy = BPtr->getType()->getPointerElementType(); unsigned TypeByteSize = DL->getTypeAllocSize(ATy); // Negative distances are not plausible dependencies. const APInt &Val = C->getValue()->getValue(); if (Val.isNegative()) { bool IsTrueDataDependence = (AIsWrite && !BIsWrite); if (IsTrueDataDependence && (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) || ATy != BTy)) return true; DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n"); return false; } // Write to the same location with the same size. // Could be improved to assert type sizes are the same (i32 == float, etc). if (Val == 0) { if (ATy == BTy) return false; DEBUG(dbgs() << "LV: Zero dependence difference but different types\n"); return true; } assert(Val.isStrictlyPositive() && "Expect a positive value"); // Positive distance bigger than max vectorization factor. if (ATy != BTy) { DEBUG(dbgs() << "LV: ReadWrite-Write positive dependency with different types\n"); return false; } unsigned Distance = (unsigned) Val.getZExtValue(); // Bail out early if passed-in parameters make vectorization not feasible. unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1; unsigned ForcedUnroll = VectorizationInterleave ? VectorizationInterleave : 1; // The distance must be bigger than the size needed for a vectorized version // of the operation and the size of the vectorized operation must not be // bigger than the currrent maximum size. if (Distance < 2*TypeByteSize || 2*TypeByteSize > MaxSafeDepDistBytes || Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { DEBUG(dbgs() << "LV: Failure because of Positive distance " << Val.getSExtValue() << '\n'); return true; } MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? Distance : MaxSafeDepDistBytes; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && couldPreventStoreLoadForward(Distance, TypeByteSize)) return true; DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() << " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'); return false; } bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides) { MaxSafeDepDistBytes = -1U; while (!CheckDeps.empty()) { MemAccessInfo CurAccess = *CheckDeps.begin(); // Get the relevant memory access set. EquivalenceClasses::iterator I = AccessSets.findValue(AccessSets.getLeaderValue(CurAccess)); // Check accesses within this set. EquivalenceClasses::member_iterator AI, AE; AI = AccessSets.member_begin(I), AE = AccessSets.member_end(); // Check every access pair. while (AI != AE) { CheckDeps.erase(*AI); EquivalenceClasses::member_iterator OI = std::next(AI); while (OI != AE) { // Check every accessing instruction pair in program order. for (std::vector::iterator I1 = Accesses[*AI].begin(), I1E = Accesses[*AI].end(); I1 != I1E; ++I1) for (std::vector::iterator I2 = Accesses[*OI].begin(), I2E = Accesses[*OI].end(); I2 != I2E; ++I2) { if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides)) return false; if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides)) return false; } ++OI; } AI++; } } return true; } bool LoopVectorizationLegality::canVectorizeMemory() { typedef SmallVector ValueVector; typedef SmallPtrSet ValueSet; // Holds the Load and Store *instructions*. ValueVector Loads; ValueVector Stores; // Holds all the different accesses in the loop. unsigned NumReads = 0; unsigned NumReadWrites = 0; PtrRtCheck.Pointers.clear(); PtrRtCheck.Need = false; const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); MemoryDepChecker DepChecker(SE, DL, TheLoop); // For each block. for (Loop::block_iterator bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; ++bb) { // Scan the BB and collect legal loads and stores. for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; ++it) { // If this is a load, save it. If this instruction can read from memory // but is not a load, then we quit. Notice that we don't handle function // calls that read or write. if (it->mayReadFromMemory()) { // Many math library functions read the rounding mode. We will only // vectorize a loop if it contains known function calls that don't set // the flag. Therefore, it is safe to ignore this read from memory. CallInst *Call = dyn_cast(it); if (Call && getIntrinsicIDForCall(Call, TLI)) continue; LoadInst *Ld = dyn_cast(it); if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { emitAnalysis(Report(Ld) << "read with atomic ordering or volatile read"); DEBUG(dbgs() << "LV: Found a non-simple load.\n"); return false; } NumLoads++; Loads.push_back(Ld); DepChecker.addAccess(Ld); continue; } // Save 'store' instructions. Abort if other instructions write to memory. if (it->mayWriteToMemory()) { StoreInst *St = dyn_cast(it); if (!St) { emitAnalysis(Report(it) << "instruction cannot be vectorized"); return false; } if (!St->isSimple() && !IsAnnotatedParallel) { emitAnalysis(Report(St) << "write with atomic ordering or volatile write"); DEBUG(dbgs() << "LV: Found a non-simple store.\n"); return false; } NumStores++; Stores.push_back(St); DepChecker.addAccess(St); } } // Next instr. } // Next block. // Now we have two lists that hold the loads and the stores. // Next, we find the pointers that they use. // Check if we see any stores. If there are no stores, then we don't // care if the pointers are *restrict*. if (!Stores.size()) { DEBUG(dbgs() << "LV: Found a read-only loop!\n"); return true; } AccessAnalysis::DepCandidates DependentAccesses; AccessAnalysis Accesses(DL, AA, DependentAccesses); // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects // multiple times on the same object. If the ptr is accessed twice, once // for read and once for write, it will only appear once (on the write // list). This is okay, since we are going to check for conflicts between // writes and between reads and writes, but not between reads and reads. ValueSet Seen; ValueVector::iterator I, IE; for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { StoreInst *ST = cast(*I); Value* Ptr = ST->getPointerOperand(); if (isUniform(Ptr)) { emitAnalysis( Report(ST) << "write to a loop invariant address could not be vectorized"); DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); return false; } // If we did *not* see this pointer before, insert it to the read-write // list. At this phase it is only a 'write' list. if (Seen.insert(Ptr).second) { ++NumReadWrites; AliasAnalysis::Location Loc = AA->getLocation(ST); // The TBAA metadata could have a control dependency on the predication // condition, so we cannot rely on it when determining whether or not we // need runtime pointer checks. if (blockNeedsPredication(ST->getParent())) Loc.AATags.TBAA = nullptr; Accesses.addStore(Loc); } } if (IsAnnotatedParallel) { DEBUG(dbgs() << "LV: A loop annotated parallel, ignore memory dependency " << "checks.\n"); return true; } for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { LoadInst *LD = cast(*I); Value* Ptr = LD->getPointerOperand(); // If we did *not* see this pointer before, insert it to the // read list. If we *did* see it before, then it is already in // the read-write list. This allows us to vectorize expressions // such as A[i] += x; Because the address of A[i] is a read-write // pointer. This only works if the index of A[i] is consecutive. // If the address of i is unknown (for example A[B[i]]) then we may // read a few words, modify, and write a few words, and some of the // words may be written to the same address. bool IsReadOnlyPtr = false; if (Seen.insert(Ptr).second || !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) { ++NumReads; IsReadOnlyPtr = true; } AliasAnalysis::Location Loc = AA->getLocation(LD); // The TBAA metadata could have a control dependency on the predication // condition, so we cannot rely on it when determining whether or not we // need runtime pointer checks. if (blockNeedsPredication(LD->getParent())) Loc.AATags.TBAA = nullptr; Accesses.addLoad(Loc, IsReadOnlyPtr); } // If we write (or read-write) to a single destination and there are no // other reads in this loop then is it safe to vectorize. if (NumReadWrites == 1 && NumReads == 0) { DEBUG(dbgs() << "LV: Found a write-only loop!\n"); return true; } // Build dependence sets and check whether we need a runtime pointer bounds // check. Accesses.buildDependenceSets(); bool NeedRTCheck = Accesses.isRTCheckNeeded(); // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. unsigned NumComparisons = 0; bool CanDoRT = false; if (NeedRTCheck) CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop, Strides); DEBUG(dbgs() << "LV: We need to do " << NumComparisons << " pointer comparisons.\n"); // If we only have one set of dependences to check pointers among we don't // need a runtime check. if (NumComparisons == 0 && NeedRTCheck) NeedRTCheck = false; // Check that we did not collect too many pointers or found an unsizeable // pointer. if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { PtrRtCheck.reset(); CanDoRT = false; } if (CanDoRT) { DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); } if (NeedRTCheck && !CanDoRT) { emitAnalysis(Report() << "cannot identify array bounds"); DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << "the array bounds.\n"); PtrRtCheck.reset(); return false; } PtrRtCheck.Need = NeedRTCheck; bool CanVecMem = true; if (Accesses.isDependencyCheckNeeded()) { DEBUG(dbgs() << "LV: Checking memory dependencies\n"); CanVecMem = DepChecker.areDepsSafe( DependentAccesses, Accesses.getDependenciesToCheck(), Strides); MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes(); if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) { DEBUG(dbgs() << "LV: Retrying with memory checks\n"); NeedRTCheck = true; // Clear the dependency checks. We assume they are not needed. Accesses.resetDepChecks(); PtrRtCheck.reset(); PtrRtCheck.Need = true; CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop, Strides, true); // Check that we did not collect too many pointers or found an unsizeable // pointer. if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { if (!CanDoRT && NumComparisons > 0) emitAnalysis(Report() << "cannot check memory dependencies at runtime"); else emitAnalysis(Report() << NumComparisons << " exceeds limit of " << RuntimeMemoryCheckThreshold << " dependent memory operations checked at runtime"); DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n"); PtrRtCheck.reset(); return false; } CanVecMem = true; } } if (!CanVecMem) emitAnalysis(Report() << "unsafe dependent memory operations in loop"); DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << " need a runtime memory check.\n"); return CanVecMem; } static bool hasMultipleUsesOf(Instruction *I, SmallPtrSetImpl &Insts) { unsigned NumUses = 0; for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) { if (Insts.count(dyn_cast(*Use))) ++NumUses; if (NumUses > 1) return true; } return false; } static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl &Set) { for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) if (!Set.count(dyn_cast(*Use))) return false; return true; } bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, ReductionKind Kind) { if (Phi->getNumIncomingValues() != 2) return false; // Reduction variables are only found in the loop header block. if (Phi->getParent() != TheLoop->getHeader()) return false; // Obtain the reduction start value from the value that comes from the loop // preheader. Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); // ExitInstruction is the single value which is used outside the loop. // We only allow for a single reduction value to be used outside the loop. // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). Instruction *ExitInstruction = nullptr; // Indicates that we found a reduction operation in our scan. bool FoundReduxOp = false; // We start with the PHI node and scan for all of the users of this // instruction. All users must be instructions that can be used as reduction // variables (such as ADD). We must have a single out-of-block user. The cycle // must include the original PHI. bool FoundStartPHI = false; // To recognize min/max patterns formed by a icmp select sequence, we store // the number of instruction we saw from the recognized min/max pattern, // to make sure we only see exactly the two instructions. unsigned NumCmpSelectPatternInst = 0; ReductionInstDesc ReduxDesc(false, nullptr); SmallPtrSet VisitedInsts; SmallVector Worklist; Worklist.push_back(Phi); VisitedInsts.insert(Phi); // A value in the reduction can be used: // - By the reduction: // - Reduction operation: // - One use of reduction value (safe). // - Multiple use of reduction value (not safe). // - PHI: // - All uses of the PHI must be the reduction (safe). // - Otherwise, not safe. // - By one instruction outside of the loop (safe). // - By further instructions outside of the loop (not safe). // - By an instruction that is not part of the reduction (not safe). // This is either: // * An instruction type other than PHI or the reduction operation. // * A PHI in the header other than the initial PHI. while (!Worklist.empty()) { Instruction *Cur = Worklist.back(); Worklist.pop_back(); // No Users. // If the instruction has no users then this is a broken chain and can't be // a reduction variable. if (Cur->use_empty()) return false; bool IsAPhi = isa(Cur); // A header PHI use other than the original PHI. if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent()) return false; // Reductions of instructions such as Div, and Sub is only possible if the // LHS is the reduction variable. if (!Cur->isCommutative() && !IsAPhi && !isa(Cur) && !isa(Cur) && !isa(Cur) && !VisitedInsts.count(dyn_cast(Cur->getOperand(0)))) return false; // Any reduction instruction must be of one of the allowed kinds. ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc); if (!ReduxDesc.IsReduction) return false; // A reduction operation must only have one use of the reduction value. if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && hasMultipleUsesOf(Cur, VisitedInsts)) return false; // All inputs to a PHI node must be a reduction value. if(IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts)) return false; if (Kind == RK_IntegerMinMax && (isa(Cur) || isa(Cur))) ++NumCmpSelectPatternInst; if (Kind == RK_FloatMinMax && (isa(Cur) || isa(Cur))) ++NumCmpSelectPatternInst; // Check whether we found a reduction operator. FoundReduxOp |= !IsAPhi; // Process users of current instruction. Push non-PHI nodes after PHI nodes // onto the stack. This way we are going to have seen all inputs to PHI // nodes once we get to them. SmallVector NonPHIs; SmallVector PHIs; for (User *U : Cur->users()) { Instruction *UI = cast(U); // Check if we found the exit user. BasicBlock *Parent = UI->getParent(); if (!TheLoop->contains(Parent)) { // Exit if you find multiple outside users or if the header phi node is // being used. In this case the user uses the value of the previous // iteration, in which case we would loose "VF-1" iterations of the // reduction operation if we vectorize. if (ExitInstruction != nullptr || Cur == Phi) return false; // The instruction used by an outside user must be the last instruction // before we feed back to the reduction phi. Otherwise, we loose VF-1 // operations on the value. if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end()) return false; ExitInstruction = Cur; continue; } // Process instructions only once (termination). Each reduction cycle // value must only be used once, except by phi nodes and min/max // reductions which are represented as a cmp followed by a select. ReductionInstDesc IgnoredVal(false, nullptr); if (VisitedInsts.insert(UI).second) { if (isa(UI)) PHIs.push_back(UI); else NonPHIs.push_back(UI); } else if (!isa(UI) && ((!isa(UI) && !isa(UI) && !isa(UI)) || !isMinMaxSelectCmpPattern(UI, IgnoredVal).IsReduction)) return false; // Remember that we completed the cycle. if (UI == Phi) FoundStartPHI = true; } Worklist.append(PHIs.begin(), PHIs.end()); Worklist.append(NonPHIs.begin(), NonPHIs.end()); } // This means we have seen one but not the other instruction of the // pattern or more than just a select and cmp. if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) && NumCmpSelectPatternInst != 2) return false; if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; // We found a reduction var if we have reached the original phi node and we // only have a single instruction with out-of-loop users. // This instruction is allowed to have out-of-loop users. AllowedExit.insert(ExitInstruction); // Save the description of this reduction variable. ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, ReduxDesc.MinMaxKind); Reductions[Phi] = RD; // We've ended the cycle. This is a reduction variable if we have an // outside user and it has a binary op. return true; } /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction /// pattern corresponding to a min(X, Y) or max(X, Y). LoopVectorizationLegality::ReductionInstDesc LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionInstDesc &Prev) { assert((isa(I) || isa(I) || isa(I)) && "Expect a select instruction"); Instruction *Cmp = nullptr; SelectInst *Select = nullptr; // We must handle the select(cmp()) as a single instruction. Advance to the // select. if ((Cmp = dyn_cast(I)) || (Cmp = dyn_cast(I))) { if (!Cmp->hasOneUse() || !(Select = dyn_cast(*I->user_begin()))) return ReductionInstDesc(false, I); return ReductionInstDesc(Select, Prev.MinMaxKind); } // Only handle single use cases for now. if (!(Select = dyn_cast(I))) return ReductionInstDesc(false, I); if (!(Cmp = dyn_cast(I->getOperand(0))) && !(Cmp = dyn_cast(I->getOperand(0)))) return ReductionInstDesc(false, I); if (!Cmp->hasOneUse()) return ReductionInstDesc(false, I); Value *CmpLeft; Value *CmpRight; // Look for a min/max pattern. if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return ReductionInstDesc(Select, MRK_UIntMin); else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return ReductionInstDesc(Select, MRK_UIntMax); else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return ReductionInstDesc(Select, MRK_SIntMax); else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return ReductionInstDesc(Select, MRK_SIntMin); else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return ReductionInstDesc(Select, MRK_FloatMin); else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return ReductionInstDesc(Select, MRK_FloatMax); else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return ReductionInstDesc(Select, MRK_FloatMin); else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return ReductionInstDesc(Select, MRK_FloatMax); return ReductionInstDesc(false, I); } LoopVectorizationLegality::ReductionInstDesc LoopVectorizationLegality::isReductionInstr(Instruction *I, ReductionKind Kind, ReductionInstDesc &Prev) { bool FP = I->getType()->isFloatingPointTy(); bool FastMath = FP && I->hasUnsafeAlgebra(); switch (I->getOpcode()) { default: return ReductionInstDesc(false, I); case Instruction::PHI: if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd && Kind != RK_FloatMinMax)) return ReductionInstDesc(false, I); return ReductionInstDesc(I, Prev.MinMaxKind); case Instruction::Sub: case Instruction::Add: return ReductionInstDesc(Kind == RK_IntegerAdd, I); case Instruction::Mul: return ReductionInstDesc(Kind == RK_IntegerMult, I); case Instruction::And: return ReductionInstDesc(Kind == RK_IntegerAnd, I); case Instruction::Or: return ReductionInstDesc(Kind == RK_IntegerOr, I); case Instruction::Xor: return ReductionInstDesc(Kind == RK_IntegerXor, I); case Instruction::FMul: return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I); case Instruction::FSub: case Instruction::FAdd: return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I); case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: if (Kind != RK_IntegerMinMax && (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) return ReductionInstDesc(false, I); return isMinMaxSelectCmpPattern(I, Prev); } } LoopVectorizationLegality::InductionKind LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { Type *PhiTy = Phi->getType(); // We only handle integer and pointer inductions variables. if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) return IK_NoInduction; // Check that the PHI is consecutive. const SCEV *PhiScev = SE->getSCEV(Phi); const SCEVAddRecExpr *AR = dyn_cast(PhiScev); if (!AR) { DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); return IK_NoInduction; } const SCEV *Step = AR->getStepRecurrence(*SE); // Integer inductions need to have a stride of one. if (PhiTy->isIntegerTy()) { if (Step->isOne()) return IK_IntInduction; if (Step->isAllOnesValue()) return IK_ReverseIntInduction; return IK_NoInduction; } // Calculate the pointer stride and check if it is consecutive. const SCEVConstant *C = dyn_cast(Step); if (!C) return IK_NoInduction; assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); Type *PointerElementType = PhiTy->getPointerElementType(); // The pointer stride cannot be determined if the pointer element type is not // sized. if (!PointerElementType->isSized()) return IK_NoInduction; uint64_t Size = DL->getTypeAllocSize(PointerElementType); if (C->getValue()->equalsInt(Size)) return IK_PtrInduction; else if (C->getValue()->equalsInt(0 - Size)) return IK_ReversePtrInduction; return IK_NoInduction; } bool LoopVectorizationLegality::isInductionVariable(const Value *V) { Value *In0 = const_cast(V); PHINode *PN = dyn_cast_or_null(In0); if (!PN) return false; return Inductions.count(PN); } bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { assert(TheLoop->contains(BB) && "Unknown block used"); // Blocks that do not dominate the latch need predication. BasicBlock* Latch = TheLoop->getLoopLatch(); return !DT->dominates(BB, Latch); } bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl &SafePtrs) { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { // Check that we don't have a constant expression that can trap as operand. for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end(); OI != OE; ++OI) { if (Constant *C = dyn_cast(*OI)) if (C->canTrap()) return false; } // We might be able to hoist the load. if (it->mayReadFromMemory()) { LoadInst *LI = dyn_cast(it); if (!LI) return false; if (!SafePtrs.count(LI->getPointerOperand())) { if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) { MaskedOp.insert(LI); continue; } return false; } } // We don't predicate stores at the moment. if (it->mayWriteToMemory()) { StoreInst *SI = dyn_cast(it); // We only support predication of stores in basic blocks with one // predecessor. if (!SI) return false; bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0); bool isSinglePredecessor = SI->getParent()->getSinglePredecessor(); if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || !isSinglePredecessor) { // Build a masked store if it is legal for the target, otherwise scalarize // the block. bool isLegalMaskedOp = isLegalMaskedStore(SI->getValueOperand()->getType(), SI->getPointerOperand()); if (isLegalMaskedOp) { --NumPredStores; MaskedOp.insert(SI); continue; } return false; } } if (it->mayThrow()) return false; // The instructions below can trap. switch (it->getOpcode()) { default: continue; case Instruction::UDiv: case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: return false; } } return true; } LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { // Width 1 means no vectorize VectorizationFactor Factor = { 1U, 0U }; if (OptForSize && Legal->getRuntimePointerCheck()->Need) { emitAnalysis(Report() << "runtime pointer checks needed. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os"); DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); return Factor; } if (!EnableCondStoresVectorization && Legal->NumPredStores) { emitAnalysis(Report() << "store that is conditionally executed prevents vectorization"); DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); return Factor; } // Find the trip count. unsigned TC = SE->getSmallConstantTripCount(TheLoop); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); unsigned WidestType = getWidestType(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); unsigned MaxSafeDepDist = -1U; if (Legal->getMaxSafeDepDistBytes() != -1U) MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; WidestRegister = ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist); unsigned MaxVectorSize = WidestRegister / WidestType; DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"); if (MaxVectorSize == 0) { DEBUG(dbgs() << "LV: The target has no vector registers.\n"); MaxVectorSize = 1; } assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements" " into one vector!"); unsigned VF = MaxVectorSize; // If we optimize the program for size, avoid creating the tail loop. if (OptForSize) { // If we are unable to calculate the trip count then don't try to vectorize. if (TC < 2) { emitAnalysis(Report() << "unable to calculate the loop count due to complex control flow"); DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); return Factor; } // Find the maximum SIMD width that can fit within the trip count. VF = TC % MaxVectorSize; if (VF == 0) VF = MaxVectorSize; // If the trip count that we found modulo the vectorization factor is not // zero then we require a tail. if (VF < 2) { emitAnalysis(Report() << "cannot optimize for size and vectorize at the " "same time. Enable vectorization of this loop " "with '#pragma clang loop vectorize(enable)' " "when compiling with -Os"); DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); return Factor; } } int UserVF = Hints->getWidth(); if (UserVF != 0) { assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); Factor.Width = UserVF; return Factor; } float Cost = expectedCost(1); #ifndef NDEBUG const float ScalarCost = Cost; #endif /* NDEBUG */ unsigned Width = 1; DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; // Ignore scalar width, because the user explicitly wants vectorization. if (ForceVectorization && VF > 1) { Width = 2; Cost = expectedCost(Width) / (float)Width; } for (unsigned i=2; i <= VF; i*=2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. float VectorCost = expectedCost(i) / (float)i; DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); if (VectorCost < Cost) { Cost = VectorCost; Width = i; } } DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n"); Factor.Width = Width; Factor.Cost = Width * Cost; return Factor; } unsigned LoopVectorizationCostModel::getWidestType() { unsigned MaxWidth = 8; // For each block. for (Loop::block_iterator bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; ++bb) { BasicBlock *BB = *bb; // For each instruction in the loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { Type *T = it->getType(); // Ignore ephemeral values. if (EphValues.count(it)) continue; // Only examine Loads, Stores and PHINodes. if (!isa(it) && !isa(it) && !isa(it)) continue; // Examine PHI nodes that are reduction variables. if (PHINode *PN = dyn_cast(it)) if (!Legal->getReductionVars()->count(PN)) continue; // Examine the stored values. if (StoreInst *ST = dyn_cast(it)) T = ST->getValueOperand()->getType(); // Ignore loaded pointer types and stored pointer types that are not // consecutive. However, we do want to take consecutive stores/loads of // pointer vectors into account. if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) continue; MaxWidth = std::max(MaxWidth, (unsigned)DL->getTypeSizeInBits(T->getScalarType())); } } return MaxWidth; } unsigned LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost) { // -- The unroll heuristics -- // We unroll the loop in order to expose ILP and reduce the loop overhead. // There are many micro-architectural considerations that we can't predict // at this level. For example, frontend pressure (on decode or fetch) due to // code size, or the number and capabilities of the execution ports. // // We use the following heuristics to select the unroll factor: // 1. If the code has reductions, then we unroll in order to break the cross // iteration dependency. // 2. If the loop is really small, then we unroll in order to reduce the loop // overhead. // 3. We don't unroll if we think that we will spill registers to memory due // to the increased register pressure. // Use the user preference, unless 'auto' is selected. int UserUF = Hints->getInterleave(); if (UserUF != 0) return UserUF; // When we optimize for size, we don't unroll. if (OptForSize) return 1; // We used the distance for the unroll factor. if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; // Do not unroll loops with a relatively small trip count. unsigned TC = SE->getSmallConstantTripCount(TheLoop); if (TC > 1 && TC < TinyTripCountUnrollThreshold) return 1; unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << " registers\n"); if (VF == 1) { if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) TargetNumRegisters = ForceTargetNumScalarRegs; } else { if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) TargetNumRegisters = ForceTargetNumVectorRegs; } LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); // We divide by these constants so assume that we have at least one // instruction that uses at least one register. R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); R.NumInstructions = std::max(R.NumInstructions, 1U); // We calculate the unroll factor using the following formula. // Subtract the number of loop invariants from the number of available // registers. These registers are used by all of the unrolled instances. // Next, divide the remaining registers by the number of registers that is // required by the loop, in order to estimate how many parallel instances // fit without causing spills. All of this is rounded down if necessary to be // a power of two. We want power of two unroll factors to simplify any // addressing operations or alignment considerations. unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers); // Don't count the induction variable as unrolled. if (EnableIndVarRegisterHeur) UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / std::max(1U, (R.MaxLocalUsers - 1))); // Clamp the unroll factor ranges to reasonable factors. unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(); // Check if the user has overridden the unroll max. if (VF == 1) { if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor; } else { if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor; } // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) LoopCost = expectedCost(VF); // Clamp the calculated UF to be between the 1 and the max unroll factor // that the target allows. if (UF > MaxInterleaveSize) UF = MaxInterleaveSize; else if (UF < 1) UF = 1; // Unroll if we vectorized this loop and there is a reduction that could // benefit from unrolling. if (VF > 1 && Legal->getReductionVars()->size()) { DEBUG(dbgs() << "LV: Unrolling because of reductions.\n"); return UF; } // Note that if we've already vectorized the loop we will have done the // runtime check and so unrolling won't require further checks. bool UnrollingRequiresRuntimePointerCheck = (VF == 1 && Legal->getRuntimePointerCheck()->Need); // We want to unroll small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); if (!UnrollingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and unroll until the cost of the // loop overhead is about 5% of the cost of the loop. unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); // Unroll until store/load ports (estimated by max unroll factor) are // saturated. unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1); unsigned LoadsUF = UF / (Legal->NumLoads ? Legal->NumLoads : 1); // If we have a scalar reduction (vector reductions are already dealt with // by this point), we can increase the critical path length if the loop // we're unrolling is inside another loop. Limit, by default to 2, so the // critical path only gets increased by one reduction operation. if (Legal->getReductionVars()->size() && TheLoop->getLoopDepth() > 1) { unsigned F = static_cast(MaxNestedScalarReductionUF); SmallUF = std::min(SmallUF, F); StoresUF = std::min(StoresUF, F); LoadsUF = std::min(LoadsUF, F); } if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) { DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n"); return std::max(StoresUF, LoadsUF); } DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n"); return SmallUF; } DEBUG(dbgs() << "LV: Not Unrolling.\n"); return 1; } LoopVectorizationCostModel::RegisterUsage LoopVectorizationCostModel::calculateRegisterUsage() { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and // assign a number to each instruction. We use RPO to ensure that defs are // met before their users. We assume that each instruction that has in-loop // users starts an interval. We record every time that an in-loop value is // used, so we have a list of the first and last occurrences of each // instruction. Next, we transpose this data structure into a multi map that // holds the list of intervals that *end* at a specific location. This multi // map allows us to perform a linear search. We scan the instructions linearly // and record each time that a new interval starts, by placing it in a set. // If we find this value in the multi-map then we remove it from the set. // The max register usage is the maximum size of the set. // We also search for instructions that are defined outside the loop, but are // used inside the loop. We need this number separately from the max-interval // usage number because when we unroll, loop-invariant values do not take // more register. LoopBlocksDFS DFS(TheLoop); DFS.perform(LI); RegisterUsage R; R.NumInstructions = 0; // Each 'key' in the map opens a new interval. The values // of the map are the index of the 'last seen' usage of the // instruction that is the key. typedef DenseMap IntervalMap; // Maps instruction to its index. DenseMap IdxToInstr; // Marks the end of each interval. IntervalMap EndPoint; // Saves the list of instruction indices that are used in the loop. SmallSet Ends; // Saves the list of values that are used in the loop but are // defined outside the loop, such as arguments and constants. SmallPtrSet LoopInvariants; unsigned Index = 0; for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO(); bb != be; ++bb) { R.NumInstructions += (*bb)->size(); for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; ++it) { Instruction *I = it; IdxToInstr[Index++] = I; // Save the end location of each USE. for (unsigned i = 0; i < I->getNumOperands(); ++i) { Value *U = I->getOperand(i); Instruction *Instr = dyn_cast(U); // Ignore non-instruction values such as arguments, constants, etc. if (!Instr) continue; // If this instruction is outside the loop then record it and continue. if (!TheLoop->contains(Instr)) { LoopInvariants.insert(Instr); continue; } // Overwrite previous end points. EndPoint[Instr] = Index; Ends.insert(Instr); } } } // Saves the list of intervals that end with the index in 'key'. typedef SmallVector InstrList; DenseMap TransposeEnds; // Transpose the EndPoints to a list of values that end at each index. for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end(); it != e; ++it) TransposeEnds[it->second].push_back(it->first); SmallSet OpenIntervals; unsigned MaxUsage = 0; DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); for (unsigned int i = 0; i < Index; ++i) { Instruction *I = IdxToInstr[i]; // Ignore instructions that are never used within the loop. if (!Ends.count(I)) continue; // Ignore ephemeral values. if (EphValues.count(I)) continue; // Remove all of the instructions that end at this location. InstrList &List = TransposeEnds[i]; for (unsigned int j=0, e = List.size(); j < e; ++j) OpenIntervals.erase(List[j]); // Count the number of live interals. MaxUsage = std::max(MaxUsage, OpenIntervals.size()); DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. OpenIntervals.insert(I); } unsigned Invariant = LoopInvariants.size(); DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'); DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'); R.LoopInvariantRegs = Invariant; R.MaxLocalUsers = MaxUsage; return R; } unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { unsigned Cost = 0; // For each block. for (Loop::block_iterator bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; ++bb) { unsigned BlockCost = 0; BasicBlock *BB = *bb; // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { // Skip dbg intrinsics. if (isa(it)) continue; // Ignore ephemeral values. if (EphValues.count(it)) continue; unsigned C = getInstructionCost(it, VF); // Check if we should override the cost. if (ForceTargetInstructionCost.getNumOccurrences() > 0) C = ForceTargetInstructionCost; BlockCost += C; DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " << VF << " For instruction: " << *it << '\n'); } // We assume that if-converted blocks have a 50% chance of being executed. // When the code is scalar then some of the blocks are avoided due to CF. // When the code is vectorized we execute all code paths. if (VF == 1 && Legal->blockNeedsPredication(*bb)) BlockCost /= 2; Cost += BlockCost; } return Cost; } /// \brief Check whether the address computation for a non-consecutive memory /// access looks like an unlikely candidate for being merged into the indexing /// mode. /// /// We look for a GEP which has one index that is an induction variable and all /// other indices are loop invariant. If the stride of this access is also /// within a small bound we decide that this address computation can likely be /// merged into the addressing mode. /// In all other cases, we identify the address computation as complex. static bool isLikelyComplexAddressComputation(Value *Ptr, LoopVectorizationLegality *Legal, ScalarEvolution *SE, const Loop *TheLoop) { GetElementPtrInst *Gep = dyn_cast(Ptr); if (!Gep) return true; // We are looking for a gep with all loop invariant indices except for one // which should be an induction variable. unsigned NumOperands = Gep->getNumOperands(); for (unsigned i = 1; i < NumOperands; ++i) { Value *Opd = Gep->getOperand(i); if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && !Legal->isInductionVariable(Opd)) return true; } // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step // can likely be merged into the address computation. unsigned MaxMergeDistance = 64; const SCEVAddRecExpr *AddRec = dyn_cast(SE->getSCEV(Ptr)); if (!AddRec) return true; // Check the step is constant. const SCEV *Step = AddRec->getStepRecurrence(*SE); // Calculate the pointer stride and check if it is consecutive. const SCEVConstant *C = dyn_cast(Step); if (!C) return true; const APInt &APStepVal = C->getValue()->getValue(); // Huge step value - give up. if (APStepVal.getBitWidth() > 64) return true; int64_t StepVal = APStepVal.getSExtValue(); return StepVal > MaxMergeDistance; } static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1))) return true; return false; } unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // If we know that this instruction will remain uniform, check the cost of // the scalar version. if (Legal->isUniformAfterVectorization(I)) VF = 1; Type *RetTy = I->getType(); Type *VectorTy = ToVectorTy(RetTy, VF); // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { case Instruction::GetElementPtr: // We mark this instruction as zero-cost because the cost of GEPs in // vectorized code depends on whether the corresponding memory instruction // is scalarized or not. Therefore, we handle GEPs with the memory // instruction cost. return 0; case Instruction::Br: { return TTI.getCFInstrCost(I->getOpcode()); } case Instruction::PHI: //TODO: IF-converted IFs become selects. return 0; case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::FDiv: case Instruction::URem: case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: case Instruction::And: case Instruction::Or: case Instruction::Xor: { // Since we will replace the stride by 1 the multiplication should go away. if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) return 0; // Certain instructions can be cheaper to vectorize if they have a constant // second vector operand. One example of this are shifts on x86. TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueProperties Op1VP = TargetTransformInfo::OP_None; TargetTransformInfo::OperandValueProperties Op2VP = TargetTransformInfo::OP_None; Value *Op2 = I->getOperand(1); // Check for a splat of a constant or for a non uniform vector of constants. if (isa(Op2)) { ConstantInt *CInt = cast(Op2); if (CInt && CInt->getValue().isPowerOf2()) Op2VP = TargetTransformInfo::OP_PowerOf2; Op2VK = TargetTransformInfo::OK_UniformConstantValue; } else if (isa(Op2) || isa(Op2)) { Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; Constant *SplatValue = cast(Op2)->getSplatValue(); if (SplatValue) { ConstantInt *CInt = dyn_cast(SplatValue); if (CInt && CInt->getValue().isPowerOf2()) Op2VP = TargetTransformInfo::OP_PowerOf2; Op2VK = TargetTransformInfo::OK_UniformConstantValue; } } return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, Op1VP, Op2VP); } case Instruction::Select: { SelectInst *SI = cast(I); const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); } case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); } case Instruction::Store: case Instruction::Load: { StoreInst *SI = dyn_cast(I); LoadInst *LI = dyn_cast(I); Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType()); VectorTy = ToVectorTy(ValTy, VF); unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); unsigned AS = SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace(); Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand(); // We add the cost of address computation here instead of with the gep // instruction because only here we know whether the operation is // scalarized. if (VF == 1) return TTI.getAddressComputationCost(VectorTy) + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy); unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF; if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { bool IsComplexComputation = isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); unsigned Cost = 0; // The cost of extracting from the value vector and pointer vector. Type *PtrTy = ToVectorTy(Ptr->getType(), VF); for (unsigned i = 0; i < VF; ++i) { // The cost of extracting the pointer operand. Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); // In case of STORE, the cost of ExtractElement from the vector. // In case of LOAD, the cost of InsertElement into the returned // vector. Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement : Instruction::InsertElement, VectorTy, i); } // The cost of the scalar loads/stores. Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, AS); return Cost; } // Wide load/stores. unsigned Cost = TTI.getAddressComputationCost(VectorTy); Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); if (Reverse) Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); return Cost; } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::FPExt: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::SIToFP: case Instruction::UIToFP: case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { // We optimize the truncation of induction variable. // The cost of these is the same as the scalar operation. if (I->getOpcode() == Instruction::Trunc && Legal->isInductionVariable(I->getOperand(0))) return TTI.getCastInstrCost(I->getOpcode(), I->getType(), I->getOperand(0)->getType()); Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } case Instruction::Call: { CallInst *CI = cast(I); Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); assert(ID && "Not an intrinsic call!"); Type *RetTy = ToVectorTy(CI->getType(), VF); SmallVector Tys; for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); } default: { // We are scalarizing the instruction. Return the cost of the scalar // instruction, plus the cost of insert and extract into vector // elements, times the vector width. unsigned Cost = 0; if (!RetTy->isVoidTy() && VF != 1) { unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy); unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy); // The cost of inserting the results plus extracting each one of the // operands. Cost += VF * (InsCost + ExtCost * I->getNumOperands()); } // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); return Cost; } }// end of switch. } Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { if (Scalar->isVoidTy() || VF == 1) return Scalar; return VectorType::get(Scalar, VF); } char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { return new LoopVectorize(NoUnrolling, AlwaysVectorize); } } bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { // Check for a store. if (StoreInst *ST = dyn_cast(Inst)) return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0; // Check for a load. if (LoadInst *LI = dyn_cast(Inst)) return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0; return false; } void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector Params; setDebugLocFromInst(Builder, Instr); // Find all of the vectorized parameters. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { Value *SrcOp = Instr->getOperand(op); // If we are accessing the old induction variable, use the new one. if (SrcOp == OldInduction) { Params.push_back(getVectorValue(SrcOp)); continue; } // Try using previously calculated values. Instruction *SrcInst = dyn_cast(SrcOp); // If the src is an instruction that appeared earlier in the basic block // then it should already be vectorized. if (SrcInst && OrigLoop->contains(SrcInst)) { assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); // The parameter is a vector value from earlier. Params.push_back(WidenMap.get(SrcInst)); } else { // The parameter is a scalar from outside the loop. Maybe even a constant. VectorParts Scalars; Scalars.append(UF, SrcOp); Params.push_back(Scalars); } } assert(Params.size() == Instr->getNumOperands() && "Invalid number of operands"); // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); Value *UndefVec = IsVoidRetTy ? nullptr : UndefValue::get(Instr->getType()); // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); Instruction *InsertPt = Builder.GetInsertPoint(); BasicBlock *IfBlock = Builder.GetInsertBlock(); BasicBlock *CondBlock = nullptr; VectorParts Cond; Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), Instr->getParent()); VectorLp = LI->getLoopFor(IfBlock); assert(VectorLp && "Must have a loop for this block"); } // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { // For each scalar that we create: // Start an "if (pred) a[i] = ..." block. Value *Cmp = nullptr; if (IfPredicateStore) { if (Cond[Part]->getType()->isVectorTy()) Cond[Part] = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], ConstantInt::get(Cond[Part]->getType(), 1)); CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); LoopVectorBody.push_back(CondBlock); VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); // Update Builder with newly created basic block. Builder.SetInsertPoint(InsertPt); } Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); // Replace the operands of the cloned instructions with extracted scalars. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { Value *Op = Params[op][Part]; Cloned->setOperand(op, Op); } // Place the cloned scalar in the new loop. Builder.Insert(Cloned); // If the original scalar returns a value we need to place it in a vector // so that future users will be able to use it. if (!IsVoidRetTy) VecResults[Part] = Cloned; // End if-block. if (IfPredicateStore) { BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); LoopVectorBody.push_back(NewIfBlock); VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); OldBr->eraseFromParent(); IfBlock = NewIfBlock; } } } void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { StoreInst *SI = dyn_cast(Instr); bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent())); return scalarizeInstruction(Instr, IfPredicateStore); } Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx, bool Negate) { // When unrolling and the VF is 1, we only need to add a simple scalar. Type *ITy = Val->getType(); assert(!ITy->isVectorTy() && "Val must be a scalar"); Constant *C = ConstantInt::get(ITy, StartIdx, Negate); return Builder.CreateAdd(Val, C, "induction"); } Index: projects/clang360-import/contrib/llvm/patches/README.TXT =================================================================== --- projects/clang360-import/contrib/llvm/patches/README.TXT (revision 279024) +++ projects/clang360-import/contrib/llvm/patches/README.TXT (revision 279025) @@ -1,16 +1,16 @@ This is a set of individual patches, which contain all the customizations to llvm/clang currently in the FreeBSD base system. These can be applied in -alphabetical order to a pristine llvm/clang 3.6.0 RC3 source tree, for example +alphabetical order to a pristine llvm/clang 3.6.0 RC4 source tree, for example by doing: -svn co https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_360/rc3 llvm-3.6.0-rc3 -svn co https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_360/rc3 llvm-3.6.0-rc3/tools/clang -cd llvm-3.6.0-rc3 +svn co https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_360/rc4 llvm-3.6.0-rc4 +svn co https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_360/rc4 llvm-3.6.0-rc4/tools/clang +cd llvm-3.6.0-rc4 for p in /usr/src/contrib/llvm/patches/patch-*.diff; do patch -p0 -f -F0 -E -i $p -s || break done A number of these consist of hand-written modifications, specifically for FreeBSD, while most others are cherry pickings off the llvm and clang trunks. When a new version of llvm/clang is eventually imported, those latter ones will largely disappear. Index: projects/clang360-import/contrib/llvm/patches/patch-01-freebsd-kprintf.diff =================================================================== --- projects/clang360-import/contrib/llvm/patches/patch-01-freebsd-kprintf.diff (revision 279024) +++ projects/clang360-import/contrib/llvm/patches/patch-01-freebsd-kprintf.diff (revision 279025) @@ -1,382 +1,381 @@ This patch adds support for the FreeBSD kernel specific printf format specifiers: %b, %D, %r and %y, via a new __freebsd_kprintf__ format string type. Sent upstream as http://reviews.llvm.org/D7154 Index: tools/clang/include/clang/Analysis/Analyses/FormatString.h =================================================================== --- tools/clang/include/clang/Analysis/Analyses/FormatString.h +++ tools/clang/include/clang/Analysis/Analyses/FormatString.h @@ -161,6 +161,12 @@ class ConversionSpecifier { ObjCObjArg, // '@' ObjCBeg = ObjCObjArg, ObjCEnd = ObjCObjArg, + // FreeBSD kernel specific specifiers. + FreeBSDbArg, + FreeBSDDArg, + FreeBSDrArg, + FreeBSDyArg, + // GlibC specific specifiers. PrintErrno, // 'm' -@@ -203,8 +209,8 @@ class ConversionSpecifier { - unsigned getLength() const { +@@ -204,7 +210,8 @@ class ConversionSpecifier { return EndScanList ? EndScanList - Position : 1; } -- + - bool isIntArg() const { return kind >= IntArgBeg && kind <= IntArgEnd; } + bool isIntArg() const { return (kind >= IntArgBeg && kind <= IntArgEnd) || + kind == FreeBSDrArg || kind == FreeBSDyArg; } bool isUIntArg() const { return kind >= UIntArgBeg && kind <= UIntArgEnd; } bool isAnyIntArg() const { return kind >= IntArgBeg && kind <= UIntArgEnd; } const char *toString() const; -@@ -646,7 +652,7 @@ class FormatStringHandler { +@@ -646,7 +653,7 @@ class FormatStringHandler { bool ParsePrintfString(FormatStringHandler &H, const char *beg, const char *end, const LangOptions &LO, - const TargetInfo &Target); + const TargetInfo &Target, bool isFreeBSDKPrintf); bool ParseFormatStringHasSArg(const char *beg, const char *end, const LangOptions &LO, const TargetInfo &Target); Index: tools/clang/include/clang/Sema/Sema.h =================================================================== --- tools/clang/include/clang/Sema/Sema.h +++ tools/clang/include/clang/Sema/Sema.h -@@ -8572,6 +8572,7 @@ class Sema { +@@ -8566,6 +8566,7 @@ class Sema { FST_Strftime, FST_Strfmon, FST_Kprintf, + FST_FreeBSDKPrintf, FST_Unknown }; static FormatStringType GetFormatStringType(const FormatAttr *Format); Index: tools/clang/lib/Analysis/FormatString.cpp =================================================================== --- tools/clang/lib/Analysis/FormatString.cpp +++ tools/clang/lib/Analysis/FormatString.cpp @@ -552,6 +552,12 @@ const char *ConversionSpecifier::toString() const // Objective-C specific specifiers. case ObjCObjArg: return "@"; + // FreeBSD kernel specific specifiers. + case FreeBSDbArg: return "b"; + case FreeBSDDArg: return "D"; + case FreeBSDrArg: return "r"; + case FreeBSDyArg: return "y"; + // GlibC specific specifiers. case PrintErrno: return "m"; @@ -647,6 +653,9 @@ bool FormatSpecifier::hasValidLengthModifier(const case ConversionSpecifier::XArg: case ConversionSpecifier::nArg: return true; + case ConversionSpecifier::FreeBSDrArg: + case ConversionSpecifier::FreeBSDyArg: + return Target.getTriple().isOSFreeBSD(); default: return false; } @@ -677,6 +686,9 @@ bool FormatSpecifier::hasValidLengthModifier(const case ConversionSpecifier::ScanListArg: case ConversionSpecifier::ZArg: return true; + case ConversionSpecifier::FreeBSDrArg: + case ConversionSpecifier::FreeBSDyArg: + return Target.getTriple().isOSFreeBSD(); default: return false; } @@ -807,6 +819,10 @@ bool FormatSpecifier::hasStandardConversionSpecifi case ConversionSpecifier::SArg: return LangOpt.ObjC1 || LangOpt.ObjC2; case ConversionSpecifier::InvalidSpecifier: + case ConversionSpecifier::FreeBSDbArg: + case ConversionSpecifier::FreeBSDDArg: + case ConversionSpecifier::FreeBSDrArg: + case ConversionSpecifier::FreeBSDyArg: case ConversionSpecifier::PrintErrno: case ConversionSpecifier::DArg: case ConversionSpecifier::OArg: Index: tools/clang/lib/Analysis/PrintfFormatString.cpp =================================================================== --- tools/clang/lib/Analysis/PrintfFormatString.cpp +++ tools/clang/lib/Analysis/PrintfFormatString.cpp @@ -55,7 +55,8 @@ static PrintfSpecifierResult ParsePrintfSpecifier( unsigned &argIndex, const LangOptions &LO, const TargetInfo &Target, - bool Warn) { + bool Warn, + bool isFreeBSDKPrintf) { using namespace clang::analyze_format_string; using namespace clang::analyze_printf; @@ -206,9 +207,24 @@ static PrintfSpecifierResult ParsePrintfSpecifier( case '@': k = ConversionSpecifier::ObjCObjArg; break; // Glibc specific. case 'm': k = ConversionSpecifier::PrintErrno; break; + // FreeBSD kernel specific. + case 'b': + if (isFreeBSDKPrintf) + k = ConversionSpecifier::FreeBSDbArg; // int followed by char * + break; + case 'r': + if (isFreeBSDKPrintf) + k = ConversionSpecifier::FreeBSDrArg; // int + break; + case 'y': + if (isFreeBSDKPrintf) + k = ConversionSpecifier::FreeBSDyArg; // int + break; // Apple-specific. case 'D': - if (Target.getTriple().isOSDarwin()) + if (isFreeBSDKPrintf) + k = ConversionSpecifier::FreeBSDDArg; // void * followed by char * + else if (Target.getTriple().isOSDarwin()) k = ConversionSpecifier::DArg; break; case 'O': @@ -228,6 +244,10 @@ static PrintfSpecifierResult ParsePrintfSpecifier( FS.setConversionSpecifier(CS); if (CS.consumesDataArgument() && !FS.usesPositionalArg()) FS.setArgIndex(argIndex++); + // FreeBSD kernel specific. + if (k == ConversionSpecifier::FreeBSDbArg || + k == ConversionSpecifier::FreeBSDDArg) + argIndex++; if (k == ConversionSpecifier::InvalidSpecifier) { // Assume the conversion takes one argument. @@ -240,7 +260,8 @@ bool clang::analyze_format_string::ParsePrintfStri const char *I, const char *E, const LangOptions &LO, - const TargetInfo &Target) { + const TargetInfo &Target, + bool isFreeBSDKPrintf) { unsigned argIndex = 0; @@ -247,7 +268,8 @@ bool clang::analyze_format_string::ParsePrintfStri // Keep looking for a format specifier until we have exhausted the string. while (I != E) { const PrintfSpecifierResult &FSR = ParsePrintfSpecifier(H, I, E, argIndex, - LO, Target, true); + LO, Target, true, + isFreeBSDKPrintf); // Did a fail-stop error of any kind occur when parsing the specifier? // If so, don't do any more processing. if (FSR.shouldStop()) @@ -276,7 +298,8 @@ bool clang::analyze_format_string::ParseFormatStri FormatStringHandler H; while (I != E) { const PrintfSpecifierResult &FSR = ParsePrintfSpecifier(H, I, E, argIndex, - LO, Target, false); + LO, Target, false, + false); // Did a fail-stop error of any kind occur when parsing the specifier? // If so, don't do any more processing. if (FSR.shouldStop()) @@ -674,6 +697,8 @@ bool PrintfSpecifier::hasValidPlusPrefix() const { case ConversionSpecifier::GArg: case ConversionSpecifier::aArg: case ConversionSpecifier::AArg: + case ConversionSpecifier::FreeBSDrArg: + case ConversionSpecifier::FreeBSDyArg: return true; default: @@ -699,6 +724,8 @@ bool PrintfSpecifier::hasValidAlternativeForm() co case ConversionSpecifier::FArg: case ConversionSpecifier::gArg: case ConversionSpecifier::GArg: + case ConversionSpecifier::FreeBSDrArg: + case ConversionSpecifier::FreeBSDyArg: return true; default: @@ -729,6 +756,8 @@ bool PrintfSpecifier::hasValidLeadingZeros() const case ConversionSpecifier::FArg: case ConversionSpecifier::gArg: case ConversionSpecifier::GArg: + case ConversionSpecifier::FreeBSDrArg: + case ConversionSpecifier::FreeBSDyArg: return true; default: @@ -753,6 +782,8 @@ bool PrintfSpecifier::hasValidSpacePrefix() const case ConversionSpecifier::GArg: case ConversionSpecifier::aArg: case ConversionSpecifier::AArg: + case ConversionSpecifier::FreeBSDrArg: + case ConversionSpecifier::FreeBSDyArg: return true; default: @@ -818,6 +849,8 @@ bool PrintfSpecifier::hasValidPrecision() const { case ConversionSpecifier::gArg: case ConversionSpecifier::GArg: case ConversionSpecifier::sArg: + case ConversionSpecifier::FreeBSDrArg: + case ConversionSpecifier::FreeBSDyArg: return true; default: Index: tools/clang/lib/Sema/SemaChecking.cpp =================================================================== --- tools/clang/lib/Sema/SemaChecking.cpp +++ tools/clang/lib/Sema/SemaChecking.cpp -@@ -2579,6 +2579,7 @@ Sema::FormatStringType Sema::GetFormatStringType(c +@@ -2584,6 +2584,7 @@ Sema::FormatStringType Sema::GetFormatStringType(c .Case("strftime", FST_Strftime) .Case("strfmon", FST_Strfmon) .Cases("kprintf", "cmn_err", "vcmn_err", "zcmn_err", FST_Kprintf) + .Case("freebsd_kprintf", FST_FreeBSDKPrintf) .Default(FST_Unknown); } -@@ -3360,6 +3361,43 @@ CheckPrintfHandler::HandlePrintfSpecifier(const an +@@ -3365,6 +3366,43 @@ CheckPrintfHandler::HandlePrintfSpecifier(const an CoveredArgs.set(argIndex); } + // FreeBSD kernel extensions. + if (CS.getKind() == ConversionSpecifier::FreeBSDbArg || + CS.getKind() == ConversionSpecifier::FreeBSDDArg) { + // We need at least two arguments. + if (!CheckNumArgs(FS, CS, startSpecifier, specifierLen, argIndex + 1)) + return false; + + // Claim the second argument. + CoveredArgs.set(argIndex + 1); + + // Type check the first argument (int for %b, pointer for %D) + const Expr *Ex = getDataArg(argIndex); + const analyze_printf::ArgType &AT = + (CS.getKind() == ConversionSpecifier::FreeBSDbArg) ? + ArgType(S.Context.IntTy) : ArgType::CPointerTy; + if (AT.isValid() && !AT.matchesType(S.Context, Ex->getType())) + EmitFormatDiagnostic( + S.PDiag(diag::warn_format_conversion_argument_type_mismatch) + << AT.getRepresentativeTypeName(S.Context) << Ex->getType() + << false << Ex->getSourceRange(), + Ex->getLocStart(), /*IsStringLocation*/false, + getSpecifierRange(startSpecifier, specifierLen)); + + // Type check the second argument (char * for both %b and %D) + Ex = getDataArg(argIndex + 1); + const analyze_printf::ArgType &AT2 = ArgType::CStrTy; + if (AT2.isValid() && !AT2.matchesType(S.Context, Ex->getType())) + EmitFormatDiagnostic( + S.PDiag(diag::warn_format_conversion_argument_type_mismatch) + << AT2.getRepresentativeTypeName(S.Context) << Ex->getType() + << false << Ex->getSourceRange(), + Ex->getLocStart(), /*IsStringLocation*/false, + getSpecifierRange(startSpecifier, specifierLen)); + + return true; + } + // Check for using an Objective-C specific conversion specifier // in a non-ObjC literal. if (!ObjCContext && CS.isObjCArg()) { -@@ -3983,7 +4021,8 @@ void Sema::CheckFormatString(const StringLiteral * +@@ -3988,7 +4026,8 @@ void Sema::CheckFormatString(const StringLiteral * return; } - if (Type == FST_Printf || Type == FST_NSString) { + if (Type == FST_Printf || Type == FST_NSString || + Type == FST_FreeBSDKPrintf) { CheckPrintfHandler H(*this, FExpr, OrigFormatExpr, firstDataArg, numDataArgs, (Type == FST_NSString), Str, HasVAListArg, Args, format_idx, -@@ -3991,7 +4030,8 @@ void Sema::CheckFormatString(const StringLiteral * +@@ -3996,7 +4035,8 @@ void Sema::CheckFormatString(const StringLiteral * if (!analyze_format_string::ParsePrintfString(H, Str, Str + StrLen, getLangOpts(), - Context.getTargetInfo())) + Context.getTargetInfo(), + Type == FST_FreeBSDKPrintf)) H.DoneProcessing(); } else if (Type == FST_Scanf) { CheckScanfHandler H(*this, FExpr, OrigFormatExpr, firstDataArg, numDataArgs, Index: tools/clang/lib/Sema/SemaDeclAttr.cpp =================================================================== --- tools/clang/lib/Sema/SemaDeclAttr.cpp +++ tools/clang/lib/Sema/SemaDeclAttr.cpp @@ -2481,6 +2481,7 @@ static FormatAttrKind getFormatAttrKind(StringRef .Cases("scanf", "printf", "printf0", "strfmon", SupportedFormat) .Cases("cmn_err", "vcmn_err", "zcmn_err", SupportedFormat) .Case("kprintf", SupportedFormat) // OpenBSD. + .Case("freebsd_kprintf", SupportedFormat) // FreeBSD. .Cases("gcc_diag", "gcc_cdiag", "gcc_cxxdiag", "gcc_tdiag", IgnoredFormat) .Default(InvalidFormat); Index: tools/clang/test/Sema/attr-format.c =================================================================== --- tools/clang/test/Sema/attr-format.c +++ tools/clang/test/Sema/attr-format.c @@ -57,8 +57,15 @@ void callnull(void){ null(0, (int*)0); // expected-warning {{incompatible pointer types}} } +// FreeBSD kernel extensions +void a3(const char *a, ...) __attribute__((format(freebsd_kprintf, 1,2))); // no-error +void b3(const char *a, ...) __attribute__((format(freebsd_kprintf, 1,1))); // expected-error {{'format' attribute parameter 3 is out of bounds}} +void c3(const char *a, ...) __attribute__((format(freebsd_kprintf, 0,2))); // expected-error {{'format' attribute parameter 2 is out of bounds}} +void d3(const char *a, int c) __attribute__((format(freebsd_kprintf, 1,2))); // expected-error {{format attribute requires variadic function}} +void e3(char *str, int c, ...) __attribute__((format(freebsd_kprintf, 2,3))); // expected-error {{format argument not a string type}} + // PR4470 int xx_vprintf(const char *, va_list); Index: tools/clang/test/Sema/format-strings-freebsd.c =================================================================== --- tools/clang/test/Sema/format-strings-freebsd.c +++ tools/clang/test/Sema/format-strings-freebsd.c @@ -0,0 +1,40 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -triple i386-unknown-freebsd %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-unknown-freebsd %s + +// Test FreeBSD kernel printf extensions. +int freebsd_kernel_printf(const char *, ...) __attribute__((__format__(__freebsd_kprintf__, 1, 2))); + +void check_freebsd_kernel_extensions(int i, long l, char *s) +{ + // %b expects an int and a char * + freebsd_kernel_printf("reg=%b\n", i, "\10\2BITTWO\1BITONE\n"); // no-warning + freebsd_kernel_printf("reg=%b\n", l, "\10\2BITTWO\1BITONE\n"); // expected-warning{{format specifies type 'int' but the argument has type 'long'}} + freebsd_kernel_printf("reg=%b\n", i, l); // expected-warning{{format specifies type 'char *' but the argument has type 'long'}} + freebsd_kernel_printf("reg=%b\n", i); // expected-warning{{more '%' conversions than data arguments}} + freebsd_kernel_printf("reg=%b\n", i, "\10\2BITTWO\1BITONE\n", l); // expected-warning{{data argument not used by format string}} + + // %D expects an unsigned char * and a char * + freebsd_kernel_printf("%6D", s, ":"); // no-warning + freebsd_kernel_printf("%6D", i, ":"); // expected-warning{{format specifies type 'void *' but the argument has type 'int'}} + freebsd_kernel_printf("%6D", s, i); // expected-warning{{format specifies type 'char *' but the argument has type 'int'}} + freebsd_kernel_printf("%6D", s); // expected-warning{{more '%' conversions than data arguments}} + freebsd_kernel_printf("%6D", s, ":", i); // expected-warning{{data argument not used by format string}} + + freebsd_kernel_printf("%*D", 42, s, ":"); // no-warning + freebsd_kernel_printf("%*D", 42, i, ":"); // expected-warning{{format specifies type 'void *' but the argument has type 'int'}} + freebsd_kernel_printf("%*D", 42, s, i); // expected-warning{{format specifies type 'char *' but the argument has type 'int'}} + freebsd_kernel_printf("%*D", 42, s); // expected-warning{{more '%' conversions than data arguments}} + freebsd_kernel_printf("%*D", 42, s, ":", i); // expected-warning{{data argument not used by format string}} + + // %r expects an int + freebsd_kernel_printf("%r", i); // no-warning + freebsd_kernel_printf("%r", l); // expected-warning{{format specifies type 'int' but the argument has type 'long'}} + freebsd_kernel_printf("%lr", i); // expected-warning{{format specifies type 'long' but the argument has type 'int'}} + freebsd_kernel_printf("%lr", l); // no-warning + + // %y expects an int + freebsd_kernel_printf("%y", i); // no-warning + freebsd_kernel_printf("%y", l); // expected-warning{{format specifies type 'int' but the argument has type 'long'}} + freebsd_kernel_printf("%ly", i); // expected-warning{{format specifies type 'long' but the argument has type 'int'}} + freebsd_kernel_printf("%ly", l); // no-warning +} Index: projects/clang360-import/contrib/llvm/patches/patch-05-enable-armv6-clrex.diff =================================================================== --- projects/clang360-import/contrib/llvm/patches/patch-05-enable-armv6-clrex.diff (revision 279024) +++ projects/clang360-import/contrib/llvm/patches/patch-05-enable-armv6-clrex.diff (revision 279025) @@ -1,20 +1,20 @@ For now, enable the clrex instruction for armv6, until upstream implements this properly. Submitted by: rdivacky Introduced here: http://svnweb.freebsd.org/changeset/base/275362 Index: lib/Target/ARM/ARMInstrInfo.td =================================================================== ---- lib/Target/ARM/ARMInstrInfo.td (revision 5) -+++ lib/Target/ARM/ARMInstrInfo.td (revision 6) +--- lib/Target/ARM/ARMInstrInfo.td ++++ lib/Target/ARM/ARMInstrInfo.td @@ -4640,7 +4640,7 @@ def STLEXD : AIstlex<0b01, (outs GPR:$Rd), def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", [(int_arm_clrex)]>, - Requires<[IsARM, HasV7]> { + Requires<[IsARM, HasV6]> { let Inst{31-0} = 0b11110101011111111111000000011111; } Index: projects/clang360-import/contrib/llvm/patches/patch-06-clang-add-mips-triples.diff =================================================================== --- projects/clang360-import/contrib/llvm/patches/patch-06-clang-add-mips-triples.diff (revision 279024) +++ projects/clang360-import/contrib/llvm/patches/patch-06-clang-add-mips-triples.diff (revision 279025) @@ -1,33 +1,33 @@ Allow clang to be built for mips/mips64 backend types by adding our mips triple ids This only allows testing and does not change the defaults for mips/mips64. They still build/use gcc by default. Differential Revision: https://reviews.freebsd.org/D1190 Reviewed by: dim Introduced here: http://svnweb.freebsd.org/changeset/base/277423 Index: tools/clang/lib/Driver/Tools.cpp =================================================================== --- tools/clang/lib/Driver/Tools.cpp +++ tools/clang/lib/Driver/Tools.cpp -@@ -6651,6 +6651,17 @@ void freebsd::Link::ConstructJob(Compilation &C, c +@@ -6652,6 +6652,17 @@ void freebsd::Link::ConstructJob(Compilation &C, c CmdArgs.push_back("elf32ppc_fbsd"); } + if (Arg *A = Args.getLastArg(options::OPT_G)) { + if (ToolChain.getArch() == llvm::Triple::mips || + ToolChain.getArch() == llvm::Triple::mipsel || + ToolChain.getArch() == llvm::Triple::mips64 || + ToolChain.getArch() == llvm::Triple::mips64el) { + StringRef v = A->getValue(); + CmdArgs.push_back(Args.MakeArgString("-G" + v)); + A->claim(); + } + } + if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); Index: projects/clang360-import/contrib/llvm/tools/clang/lib/Basic/Version.cpp =================================================================== --- projects/clang360-import/contrib/llvm/tools/clang/lib/Basic/Version.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/tools/clang/lib/Basic/Version.cpp (revision 279025) @@ -1,153 +1,153 @@ //===- Version.cpp - Clang Version Number -----------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines several version-related utility functions for Clang. // //===----------------------------------------------------------------------===// #include "clang/Basic/Version.h" #include "clang/Basic/LLVM.h" #include "clang/Config/config.h" #include "llvm/Support/raw_ostream.h" #include #include #ifdef HAVE_SVN_VERSION_INC # include "SVNVersion.inc" #endif namespace clang { std::string getClangRepositoryPath() { #if defined(CLANG_REPOSITORY_STRING) return CLANG_REPOSITORY_STRING; #else #ifdef SVN_REPOSITORY StringRef URL(SVN_REPOSITORY); #else StringRef URL(""); #endif // If the SVN_REPOSITORY is empty, try to use the SVN keyword. This helps us // pick up a tag in an SVN export, for example. - StringRef SVNRepository("$URL: https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_360/rc3/lib/Basic/Version.cpp $"); + StringRef SVNRepository("$URL: https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_360/rc4/lib/Basic/Version.cpp $"); if (URL.empty()) { URL = SVNRepository.slice(SVNRepository.find(':'), SVNRepository.find("/lib/Basic")); } // Strip off version from a build from an integration branch. URL = URL.slice(0, URL.find("/src/tools/clang")); // Trim path prefix off, assuming path came from standard cfe path. size_t Start = URL.find("cfe/"); if (Start != StringRef::npos) URL = URL.substr(Start + 4); return URL; #endif } std::string getLLVMRepositoryPath() { #ifdef LLVM_REPOSITORY StringRef URL(LLVM_REPOSITORY); #else StringRef URL(""); #endif // Trim path prefix off, assuming path came from standard llvm path. // Leave "llvm/" prefix to distinguish the following llvm revision from the // clang revision. size_t Start = URL.find("llvm/"); if (Start != StringRef::npos) URL = URL.substr(Start); return URL; } std::string getClangRevision() { #ifdef SVN_REVISION return SVN_REVISION; #else return ""; #endif } std::string getLLVMRevision() { #ifdef LLVM_REVISION return LLVM_REVISION; #else return ""; #endif } std::string getClangFullRepositoryVersion() { std::string buf; llvm::raw_string_ostream OS(buf); std::string Path = getClangRepositoryPath(); std::string Revision = getClangRevision(); if (!Path.empty() || !Revision.empty()) { OS << '('; if (!Path.empty()) OS << Path; if (!Revision.empty()) { if (!Path.empty()) OS << ' '; OS << Revision; } OS << ')'; } // Support LLVM in a separate repository. std::string LLVMRev = getLLVMRevision(); if (!LLVMRev.empty() && LLVMRev != Revision) { OS << " ("; std::string LLVMRepo = getLLVMRepositoryPath(); if (!LLVMRepo.empty()) OS << LLVMRepo << ' '; OS << LLVMRev << ')'; } return OS.str(); } std::string getClangFullVersion() { return getClangToolFullVersion("clang"); } std::string getClangToolFullVersion(StringRef ToolName) { std::string buf; llvm::raw_string_ostream OS(buf); #ifdef CLANG_VENDOR OS << CLANG_VENDOR; #endif OS << ToolName << " version " CLANG_VERSION_STRING " " << getClangFullRepositoryVersion(); #ifdef CLANG_VENDOR_SUFFIX OS << CLANG_VENDOR_SUFFIX; #elif defined(CLANG_VENDOR) // If vendor supplied, include the base LLVM version as well. OS << " (based on " << BACKEND_PACKAGE_STRING << ")"; #endif return OS.str(); } std::string getClangFullCPPVersion() { // The version string we report in __VERSION__ is just a compacted version of // the one we report on the command line. std::string buf; llvm::raw_string_ostream OS(buf); #ifdef CLANG_VENDOR OS << CLANG_VENDOR; #endif OS << "Clang " CLANG_VERSION_STRING " " << getClangFullRepositoryVersion(); return OS.str(); } } // end namespace clang Index: projects/clang360-import/contrib/llvm/tools/clang/lib/CodeGen/TargetInfo.cpp =================================================================== --- projects/clang360-import/contrib/llvm/tools/clang/lib/CodeGen/TargetInfo.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/tools/clang/lib/CodeGen/TargetInfo.cpp (revision 279025) @@ -1,7211 +1,7207 @@ //===---- TargetInfo.cpp - Encapsulate target details -----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // These classes wrap the information about a call or function // definition used to handle ABI compliancy. // //===----------------------------------------------------------------------===// #include "TargetInfo.h" #include "ABIInfo.h" #include "CGCXXABI.h" #include "CGValue.h" #include "CodeGenFunction.h" #include "clang/AST/RecordLayout.h" #include "clang/CodeGen/CGFunctionInfo.h" #include "clang/Frontend/CodeGenOptions.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Type.h" #include "llvm/Support/raw_ostream.h" #include // std::sort using namespace clang; using namespace CodeGen; static void AssignToArrayRange(CodeGen::CGBuilderTy &Builder, llvm::Value *Array, llvm::Value *Value, unsigned FirstIndex, unsigned LastIndex) { // Alternatively, we could emit this as a loop in the source. for (unsigned I = FirstIndex; I <= LastIndex; ++I) { llvm::Value *Cell = Builder.CreateConstInBoundsGEP1_32(Array, I); Builder.CreateStore(Value, Cell); } } static bool isAggregateTypeForABI(QualType T) { return !CodeGenFunction::hasScalarEvaluationKind(T) || T->isMemberFunctionPointerType(); } ABIInfo::~ABIInfo() {} static CGCXXABI::RecordArgABI getRecordArgABI(const RecordType *RT, CGCXXABI &CXXABI) { const CXXRecordDecl *RD = dyn_cast(RT->getDecl()); if (!RD) return CGCXXABI::RAA_Default; return CXXABI.getRecordArgABI(RD); } static CGCXXABI::RecordArgABI getRecordArgABI(QualType T, CGCXXABI &CXXABI) { const RecordType *RT = T->getAs(); if (!RT) return CGCXXABI::RAA_Default; return getRecordArgABI(RT, CXXABI); } /// Pass transparent unions as if they were the type of the first element. Sema /// should ensure that all elements of the union have the same "machine type". static QualType useFirstFieldIfTransparentUnion(QualType Ty) { if (const RecordType *UT = Ty->getAsUnionType()) { const RecordDecl *UD = UT->getDecl(); if (UD->hasAttr()) { assert(!UD->field_empty() && "sema created an empty transparent union"); return UD->field_begin()->getType(); } } return Ty; } CGCXXABI &ABIInfo::getCXXABI() const { return CGT.getCXXABI(); } ASTContext &ABIInfo::getContext() const { return CGT.getContext(); } llvm::LLVMContext &ABIInfo::getVMContext() const { return CGT.getLLVMContext(); } const llvm::DataLayout &ABIInfo::getDataLayout() const { return CGT.getDataLayout(); } const TargetInfo &ABIInfo::getTarget() const { return CGT.getTarget(); } bool ABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const { return false; } bool ABIInfo::isHomogeneousAggregateSmallEnough(const Type *Base, uint64_t Members) const { return false; } void ABIArgInfo::dump() const { raw_ostream &OS = llvm::errs(); OS << "(ABIArgInfo Kind="; switch (TheKind) { case Direct: OS << "Direct Type="; if (llvm::Type *Ty = getCoerceToType()) Ty->print(OS); else OS << "null"; break; case Extend: OS << "Extend"; break; case Ignore: OS << "Ignore"; break; case InAlloca: OS << "InAlloca Offset=" << getInAllocaFieldIndex(); break; case Indirect: OS << "Indirect Align=" << getIndirectAlign() << " ByVal=" << getIndirectByVal() << " Realign=" << getIndirectRealign(); break; case Expand: OS << "Expand"; break; } OS << ")\n"; } TargetCodeGenInfo::~TargetCodeGenInfo() { delete Info; } // If someone can figure out a general rule for this, that would be great. // It's probably just doomed to be platform-dependent, though. unsigned TargetCodeGenInfo::getSizeOfUnwindException() const { // Verified for: // x86-64 FreeBSD, Linux, Darwin // x86-32 FreeBSD, Linux, Darwin // PowerPC Linux, Darwin // ARM Darwin (*not* EABI) // AArch64 Linux return 32; } bool TargetCodeGenInfo::isNoProtoCallVariadic(const CallArgList &args, const FunctionNoProtoType *fnType) const { // The following conventions are known to require this to be false: // x86_stdcall // MIPS // For everything else, we just prefer false unless we opt out. return false; } void TargetCodeGenInfo::getDependentLibraryOption(llvm::StringRef Lib, llvm::SmallString<24> &Opt) const { // This assumes the user is passing a library name like "rt" instead of a // filename like "librt.a/so", and that they don't care whether it's static or // dynamic. Opt = "-l"; Opt += Lib; } static bool isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays); /// isEmptyField - Return true iff a the field is "empty", that is it /// is an unnamed bit-field or an (array of) empty record(s). static bool isEmptyField(ASTContext &Context, const FieldDecl *FD, bool AllowArrays) { if (FD->isUnnamedBitfield()) return true; QualType FT = FD->getType(); // Constant arrays of empty records count as empty, strip them off. // Constant arrays of zero length always count as empty. if (AllowArrays) while (const ConstantArrayType *AT = Context.getAsConstantArrayType(FT)) { if (AT->getSize() == 0) return true; FT = AT->getElementType(); } const RecordType *RT = FT->getAs(); if (!RT) return false; // C++ record fields are never empty, at least in the Itanium ABI. // // FIXME: We should use a predicate for whether this behavior is true in the // current ABI. if (isa(RT->getDecl())) return false; return isEmptyRecord(Context, FT, AllowArrays); } /// isEmptyRecord - Return true iff a structure contains only empty /// fields. Note that a structure with a flexible array member is not /// considered empty. static bool isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays) { const RecordType *RT = T->getAs(); if (!RT) return 0; const RecordDecl *RD = RT->getDecl(); if (RD->hasFlexibleArrayMember()) return false; // If this is a C++ record, check the bases first. if (const CXXRecordDecl *CXXRD = dyn_cast(RD)) for (const auto &I : CXXRD->bases()) if (!isEmptyRecord(Context, I.getType(), true)) return false; for (const auto *I : RD->fields()) if (!isEmptyField(Context, I, AllowArrays)) return false; return true; } /// isSingleElementStruct - Determine if a structure is a "single /// element struct", i.e. it has exactly one non-empty field or /// exactly one field which is itself a single element /// struct. Structures with flexible array members are never /// considered single element structs. /// /// \return The field declaration for the single non-empty field, if /// it exists. static const Type *isSingleElementStruct(QualType T, ASTContext &Context) { const RecordType *RT = T->getAsStructureType(); if (!RT) return nullptr; const RecordDecl *RD = RT->getDecl(); if (RD->hasFlexibleArrayMember()) return nullptr; const Type *Found = nullptr; // If this is a C++ record, check the bases first. if (const CXXRecordDecl *CXXRD = dyn_cast(RD)) { for (const auto &I : CXXRD->bases()) { // Ignore empty records. if (isEmptyRecord(Context, I.getType(), true)) continue; // If we already found an element then this isn't a single-element struct. if (Found) return nullptr; // If this is non-empty and not a single element struct, the composite // cannot be a single element struct. Found = isSingleElementStruct(I.getType(), Context); if (!Found) return nullptr; } } // Check for single element. for (const auto *FD : RD->fields()) { QualType FT = FD->getType(); // Ignore empty fields. if (isEmptyField(Context, FD, true)) continue; // If we already found an element then this isn't a single-element // struct. if (Found) return nullptr; // Treat single element arrays as the element. while (const ConstantArrayType *AT = Context.getAsConstantArrayType(FT)) { if (AT->getSize().getZExtValue() != 1) break; FT = AT->getElementType(); } if (!isAggregateTypeForABI(FT)) { Found = FT.getTypePtr(); } else { Found = isSingleElementStruct(FT, Context); if (!Found) return nullptr; } } // We don't consider a struct a single-element struct if it has // padding beyond the element type. if (Found && Context.getTypeSize(Found) != Context.getTypeSize(T)) return nullptr; return Found; } static bool is32Or64BitBasicType(QualType Ty, ASTContext &Context) { // Treat complex types as the element type. if (const ComplexType *CTy = Ty->getAs()) Ty = CTy->getElementType(); // Check for a type which we know has a simple scalar argument-passing // convention without any padding. (We're specifically looking for 32 // and 64-bit integer and integer-equivalents, float, and double.) if (!Ty->getAs() && !Ty->hasPointerRepresentation() && !Ty->isEnumeralType() && !Ty->isBlockPointerType()) return false; uint64_t Size = Context.getTypeSize(Ty); return Size == 32 || Size == 64; } /// canExpandIndirectArgument - Test whether an argument type which is to be /// passed indirectly (on the stack) would have the equivalent layout if it was /// expanded into separate arguments. If so, we prefer to do the latter to avoid /// inhibiting optimizations. /// // FIXME: This predicate is missing many cases, currently it just follows // llvm-gcc (checks that all fields are 32-bit or 64-bit primitive types). We // should probably make this smarter, or better yet make the LLVM backend // capable of handling it. static bool canExpandIndirectArgument(QualType Ty, ASTContext &Context) { // We can only expand structure types. const RecordType *RT = Ty->getAs(); if (!RT) return false; // We can only expand (C) structures. // // FIXME: This needs to be generalized to handle classes as well. const RecordDecl *RD = RT->getDecl(); if (!RD->isStruct() || isa(RD)) return false; uint64_t Size = 0; for (const auto *FD : RD->fields()) { if (!is32Or64BitBasicType(FD->getType(), Context)) return false; // FIXME: Reject bit-fields wholesale; there are two problems, we don't know // how to expand them yet, and the predicate for telling if a bitfield still // counts as "basic" is more complicated than what we were doing previously. if (FD->isBitField()) return false; Size += Context.getTypeSize(FD->getType()); } // Make sure there are not any holes in the struct. if (Size != Context.getTypeSize(Ty)) return false; return true; } namespace { /// DefaultABIInfo - The default implementation for ABI specific /// details. This implementation provides information which results in /// self-consistent and sensible LLVM IR generation, but does not /// conform to any particular ABI. class DefaultABIInfo : public ABIInfo { public: DefaultABIInfo(CodeGen::CodeGenTypes &CGT) : ABIInfo(CGT) {} ABIArgInfo classifyReturnType(QualType RetTy) const; ABIArgInfo classifyArgumentType(QualType RetTy) const; void computeInfo(CGFunctionInfo &FI) const override { if (!getCXXABI().classifyReturnType(FI)) FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); for (auto &I : FI.arguments()) I.info = classifyArgumentType(I.type); } llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; }; class DefaultTargetCodeGenInfo : public TargetCodeGenInfo { public: DefaultTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT) : TargetCodeGenInfo(new DefaultABIInfo(CGT)) {} }; llvm::Value *DefaultABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { return nullptr; } ABIArgInfo DefaultABIInfo::classifyArgumentType(QualType Ty) const { if (isAggregateTypeForABI(Ty)) return ABIArgInfo::getIndirect(0); // Treat an enum type as its underlying type. if (const EnumType *EnumTy = Ty->getAs()) Ty = EnumTy->getDecl()->getIntegerType(); return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } ABIArgInfo DefaultABIInfo::classifyReturnType(QualType RetTy) const { if (RetTy->isVoidType()) return ABIArgInfo::getIgnore(); if (isAggregateTypeForABI(RetTy)) return ABIArgInfo::getIndirect(0); // Treat an enum type as its underlying type. if (const EnumType *EnumTy = RetTy->getAs()) RetTy = EnumTy->getDecl()->getIntegerType(); return (RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } //===----------------------------------------------------------------------===// // le32/PNaCl bitcode ABI Implementation // // This is a simplified version of the x86_32 ABI. Arguments and return values // are always passed on the stack. //===----------------------------------------------------------------------===// class PNaClABIInfo : public ABIInfo { public: PNaClABIInfo(CodeGen::CodeGenTypes &CGT) : ABIInfo(CGT) {} ABIArgInfo classifyReturnType(QualType RetTy) const; ABIArgInfo classifyArgumentType(QualType RetTy) const; void computeInfo(CGFunctionInfo &FI) const override; llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; }; class PNaClTargetCodeGenInfo : public TargetCodeGenInfo { public: PNaClTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT) : TargetCodeGenInfo(new PNaClABIInfo(CGT)) {} }; void PNaClABIInfo::computeInfo(CGFunctionInfo &FI) const { if (!getCXXABI().classifyReturnType(FI)) FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); for (auto &I : FI.arguments()) I.info = classifyArgumentType(I.type); } llvm::Value *PNaClABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { return nullptr; } /// \brief Classify argument of given type \p Ty. ABIArgInfo PNaClABIInfo::classifyArgumentType(QualType Ty) const { if (isAggregateTypeForABI(Ty)) { if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) return ABIArgInfo::getIndirect(0, RAA == CGCXXABI::RAA_DirectInMemory); return ABIArgInfo::getIndirect(0); } else if (const EnumType *EnumTy = Ty->getAs()) { // Treat an enum type as its underlying type. Ty = EnumTy->getDecl()->getIntegerType(); } else if (Ty->isFloatingType()) { // Floating-point types don't go inreg. return ABIArgInfo::getDirect(); } return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } ABIArgInfo PNaClABIInfo::classifyReturnType(QualType RetTy) const { if (RetTy->isVoidType()) return ABIArgInfo::getIgnore(); // In the PNaCl ABI we always return records/structures on the stack. if (isAggregateTypeForABI(RetTy)) return ABIArgInfo::getIndirect(0); // Treat an enum type as its underlying type. if (const EnumType *EnumTy = RetTy->getAs()) RetTy = EnumTy->getDecl()->getIntegerType(); return (RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } /// IsX86_MMXType - Return true if this is an MMX type. bool IsX86_MMXType(llvm::Type *IRType) { // Return true if the type is an MMX type <2 x i32>, <4 x i16>, or <8 x i8>. return IRType->isVectorTy() && IRType->getPrimitiveSizeInBits() == 64 && cast(IRType)->getElementType()->isIntegerTy() && IRType->getScalarSizeInBits() != 64; } static llvm::Type* X86AdjustInlineAsmType(CodeGen::CodeGenFunction &CGF, StringRef Constraint, llvm::Type* Ty) { if ((Constraint == "y" || Constraint == "&y") && Ty->isVectorTy()) { if (cast(Ty)->getBitWidth() != 64) { // Invalid MMX constraint return nullptr; } return llvm::Type::getX86_MMXTy(CGF.getLLVMContext()); } // No operation needed return Ty; } /// Returns true if this type can be passed in SSE registers with the /// X86_VectorCall calling convention. Shared between x86_32 and x86_64. static bool isX86VectorTypeForVectorCall(ASTContext &Context, QualType Ty) { if (const BuiltinType *BT = Ty->getAs()) { if (BT->isFloatingPoint() && BT->getKind() != BuiltinType::Half) return true; } else if (const VectorType *VT = Ty->getAs()) { // vectorcall can pass XMM, YMM, and ZMM vectors. We don't pass SSE1 MMX // registers specially. unsigned VecSize = Context.getTypeSize(VT); if (VecSize == 128 || VecSize == 256 || VecSize == 512) return true; } return false; } /// Returns true if this aggregate is small enough to be passed in SSE registers /// in the X86_VectorCall calling convention. Shared between x86_32 and x86_64. static bool isX86VectorCallAggregateSmallEnough(uint64_t NumMembers) { return NumMembers <= 4; } //===----------------------------------------------------------------------===// // X86-32 ABI Implementation //===----------------------------------------------------------------------===// /// \brief Similar to llvm::CCState, but for Clang. struct CCState { CCState(unsigned CC) : CC(CC), FreeRegs(0), FreeSSERegs(0) {} unsigned CC; unsigned FreeRegs; unsigned FreeSSERegs; }; /// X86_32ABIInfo - The X86-32 ABI information. class X86_32ABIInfo : public ABIInfo { enum Class { Integer, Float }; static const unsigned MinABIStackAlignInBytes = 4; bool IsDarwinVectorABI; bool IsSmallStructInRegABI; bool IsWin32StructABI; unsigned DefaultNumRegisterParameters; static bool isRegisterSize(unsigned Size) { return (Size == 8 || Size == 16 || Size == 32 || Size == 64); } bool isHomogeneousAggregateBaseType(QualType Ty) const override { // FIXME: Assumes vectorcall is in use. return isX86VectorTypeForVectorCall(getContext(), Ty); } bool isHomogeneousAggregateSmallEnough(const Type *Ty, uint64_t NumMembers) const override { // FIXME: Assumes vectorcall is in use. return isX86VectorCallAggregateSmallEnough(NumMembers); } bool shouldReturnTypeInRegister(QualType Ty, ASTContext &Context) const; /// getIndirectResult - Give a source type \arg Ty, return a suitable result /// such that the argument will be passed in memory. ABIArgInfo getIndirectResult(QualType Ty, bool ByVal, CCState &State) const; ABIArgInfo getIndirectReturnResult(CCState &State) const; /// \brief Return the alignment to use for the given type on the stack. unsigned getTypeStackAlignInBytes(QualType Ty, unsigned Align) const; Class classify(QualType Ty) const; ABIArgInfo classifyReturnType(QualType RetTy, CCState &State) const; ABIArgInfo classifyArgumentType(QualType RetTy, CCState &State) const; bool shouldUseInReg(QualType Ty, CCState &State, bool &NeedsPadding) const; /// \brief Rewrite the function info so that all memory arguments use /// inalloca. void rewriteWithInAlloca(CGFunctionInfo &FI) const; void addFieldToArgStruct(SmallVector &FrameFields, unsigned &StackOffset, ABIArgInfo &Info, QualType Type) const; public: void computeInfo(CGFunctionInfo &FI) const override; llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; X86_32ABIInfo(CodeGen::CodeGenTypes &CGT, bool d, bool p, bool w, unsigned r) : ABIInfo(CGT), IsDarwinVectorABI(d), IsSmallStructInRegABI(p), IsWin32StructABI(w), DefaultNumRegisterParameters(r) {} }; class X86_32TargetCodeGenInfo : public TargetCodeGenInfo { public: X86_32TargetCodeGenInfo(CodeGen::CodeGenTypes &CGT, bool d, bool p, bool w, unsigned r) :TargetCodeGenInfo(new X86_32ABIInfo(CGT, d, p, w, r)) {} static bool isStructReturnInRegABI( const llvm::Triple &Triple, const CodeGenOptions &Opts); void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &CGM) const override; int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const override { // Darwin uses different dwarf register numbers for EH. if (CGM.getTarget().getTriple().isOSDarwin()) return 5; return 4; } bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const override; llvm::Type* adjustInlineAsmType(CodeGen::CodeGenFunction &CGF, StringRef Constraint, llvm::Type* Ty) const override { return X86AdjustInlineAsmType(CGF, Constraint, Ty); } void addReturnRegisterOutputs(CodeGenFunction &CGF, LValue ReturnValue, std::string &Constraints, std::vector &ResultRegTypes, std::vector &ResultTruncRegTypes, std::vector &ResultRegDests, std::string &AsmString, unsigned NumOutputs) const override; llvm::Constant * getUBSanFunctionSignature(CodeGen::CodeGenModule &CGM) const override { unsigned Sig = (0xeb << 0) | // jmp rel8 (0x06 << 8) | // .+0x08 ('F' << 16) | ('T' << 24); return llvm::ConstantInt::get(CGM.Int32Ty, Sig); } }; } /// Rewrite input constraint references after adding some output constraints. /// In the case where there is one output and one input and we add one output, /// we need to replace all operand references greater than or equal to 1: /// mov $0, $1 /// mov eax, $1 /// The result will be: /// mov $0, $2 /// mov eax, $2 static void rewriteInputConstraintReferences(unsigned FirstIn, unsigned NumNewOuts, std::string &AsmString) { std::string Buf; llvm::raw_string_ostream OS(Buf); size_t Pos = 0; while (Pos < AsmString.size()) { size_t DollarStart = AsmString.find('$', Pos); if (DollarStart == std::string::npos) DollarStart = AsmString.size(); size_t DollarEnd = AsmString.find_first_not_of('$', DollarStart); if (DollarEnd == std::string::npos) DollarEnd = AsmString.size(); OS << StringRef(&AsmString[Pos], DollarEnd - Pos); Pos = DollarEnd; size_t NumDollars = DollarEnd - DollarStart; if (NumDollars % 2 != 0 && Pos < AsmString.size()) { // We have an operand reference. size_t DigitStart = Pos; size_t DigitEnd = AsmString.find_first_not_of("0123456789", DigitStart); if (DigitEnd == std::string::npos) DigitEnd = AsmString.size(); StringRef OperandStr(&AsmString[DigitStart], DigitEnd - DigitStart); unsigned OperandIndex; if (!OperandStr.getAsInteger(10, OperandIndex)) { if (OperandIndex >= FirstIn) OperandIndex += NumNewOuts; OS << OperandIndex; } else { OS << OperandStr; } Pos = DigitEnd; } } AsmString = std::move(OS.str()); } /// Add output constraints for EAX:EDX because they are return registers. void X86_32TargetCodeGenInfo::addReturnRegisterOutputs( CodeGenFunction &CGF, LValue ReturnSlot, std::string &Constraints, std::vector &ResultRegTypes, std::vector &ResultTruncRegTypes, std::vector &ResultRegDests, std::string &AsmString, unsigned NumOutputs) const { uint64_t RetWidth = CGF.getContext().getTypeSize(ReturnSlot.getType()); // Use the EAX constraint if the width is 32 or smaller and EAX:EDX if it is // larger. if (!Constraints.empty()) Constraints += ','; if (RetWidth <= 32) { Constraints += "={eax}"; ResultRegTypes.push_back(CGF.Int32Ty); } else { // Use the 'A' constraint for EAX:EDX. Constraints += "=A"; ResultRegTypes.push_back(CGF.Int64Ty); } // Truncate EAX or EAX:EDX to an integer of the appropriate size. llvm::Type *CoerceTy = llvm::IntegerType::get(CGF.getLLVMContext(), RetWidth); ResultTruncRegTypes.push_back(CoerceTy); // Coerce the integer by bitcasting the return slot pointer. ReturnSlot.setAddress(CGF.Builder.CreateBitCast(ReturnSlot.getAddress(), CoerceTy->getPointerTo())); ResultRegDests.push_back(ReturnSlot); rewriteInputConstraintReferences(NumOutputs, 1, AsmString); } /// shouldReturnTypeInRegister - Determine if the given type should be /// passed in a register (for the Darwin ABI). bool X86_32ABIInfo::shouldReturnTypeInRegister(QualType Ty, ASTContext &Context) const { uint64_t Size = Context.getTypeSize(Ty); // Type must be register sized. if (!isRegisterSize(Size)) return false; if (Ty->isVectorType()) { // 64- and 128- bit vectors inside structures are not returned in // registers. if (Size == 64 || Size == 128) return false; return true; } // If this is a builtin, pointer, enum, complex type, member pointer, or // member function pointer it is ok. if (Ty->getAs() || Ty->hasPointerRepresentation() || Ty->isAnyComplexType() || Ty->isEnumeralType() || Ty->isBlockPointerType() || Ty->isMemberPointerType()) return true; // Arrays are treated like records. if (const ConstantArrayType *AT = Context.getAsConstantArrayType(Ty)) return shouldReturnTypeInRegister(AT->getElementType(), Context); // Otherwise, it must be a record type. const RecordType *RT = Ty->getAs(); if (!RT) return false; // FIXME: Traverse bases here too. // Structure types are passed in register if all fields would be // passed in a register. for (const auto *FD : RT->getDecl()->fields()) { // Empty fields are ignored. if (isEmptyField(Context, FD, true)) continue; // Check fields recursively. if (!shouldReturnTypeInRegister(FD->getType(), Context)) return false; } return true; } ABIArgInfo X86_32ABIInfo::getIndirectReturnResult(CCState &State) const { // If the return value is indirect, then the hidden argument is consuming one // integer register. if (State.FreeRegs) { --State.FreeRegs; return ABIArgInfo::getIndirectInReg(/*Align=*/0, /*ByVal=*/false); } return ABIArgInfo::getIndirect(/*Align=*/0, /*ByVal=*/false); } ABIArgInfo X86_32ABIInfo::classifyReturnType(QualType RetTy, CCState &State) const { if (RetTy->isVoidType()) return ABIArgInfo::getIgnore(); const Type *Base = nullptr; uint64_t NumElts = 0; if (State.CC == llvm::CallingConv::X86_VectorCall && isHomogeneousAggregate(RetTy, Base, NumElts)) { // The LLVM struct type for such an aggregate should lower properly. return ABIArgInfo::getDirect(); } if (const VectorType *VT = RetTy->getAs()) { // On Darwin, some vectors are returned in registers. if (IsDarwinVectorABI) { uint64_t Size = getContext().getTypeSize(RetTy); // 128-bit vectors are a special case; they are returned in // registers and we need to make sure to pick a type the LLVM // backend will like. if (Size == 128) return ABIArgInfo::getDirect(llvm::VectorType::get( llvm::Type::getInt64Ty(getVMContext()), 2)); // Always return in register if it fits in a general purpose // register, or if it is 64 bits and has a single element. if ((Size == 8 || Size == 16 || Size == 32) || (Size == 64 && VT->getNumElements() == 1)) return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Size)); return getIndirectReturnResult(State); } return ABIArgInfo::getDirect(); } if (isAggregateTypeForABI(RetTy)) { if (const RecordType *RT = RetTy->getAs()) { // Structures with flexible arrays are always indirect. if (RT->getDecl()->hasFlexibleArrayMember()) return getIndirectReturnResult(State); } // If specified, structs and unions are always indirect. if (!IsSmallStructInRegABI && !RetTy->isAnyComplexType()) return getIndirectReturnResult(State); // Small structures which are register sized are generally returned // in a register. if (shouldReturnTypeInRegister(RetTy, getContext())) { uint64_t Size = getContext().getTypeSize(RetTy); // As a special-case, if the struct is a "single-element" struct, and // the field is of type "float" or "double", return it in a // floating-point register. (MSVC does not apply this special case.) // We apply a similar transformation for pointer types to improve the // quality of the generated IR. if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext())) if ((!IsWin32StructABI && SeltTy->isRealFloatingType()) || SeltTy->hasPointerRepresentation()) return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0))); // FIXME: We should be able to narrow this integer in cases with dead // padding. return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(),Size)); } return getIndirectReturnResult(State); } // Treat an enum type as its underlying type. if (const EnumType *EnumTy = RetTy->getAs()) RetTy = EnumTy->getDecl()->getIntegerType(); return (RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } static bool isSSEVectorType(ASTContext &Context, QualType Ty) { return Ty->getAs() && Context.getTypeSize(Ty) == 128; } static bool isRecordWithSSEVectorType(ASTContext &Context, QualType Ty) { const RecordType *RT = Ty->getAs(); if (!RT) return 0; const RecordDecl *RD = RT->getDecl(); // If this is a C++ record, check the bases first. if (const CXXRecordDecl *CXXRD = dyn_cast(RD)) for (const auto &I : CXXRD->bases()) if (!isRecordWithSSEVectorType(Context, I.getType())) return false; for (const auto *i : RD->fields()) { QualType FT = i->getType(); if (isSSEVectorType(Context, FT)) return true; if (isRecordWithSSEVectorType(Context, FT)) return true; } return false; } unsigned X86_32ABIInfo::getTypeStackAlignInBytes(QualType Ty, unsigned Align) const { // Otherwise, if the alignment is less than or equal to the minimum ABI // alignment, just use the default; the backend will handle this. if (Align <= MinABIStackAlignInBytes) return 0; // Use default alignment. // On non-Darwin, the stack type alignment is always 4. if (!IsDarwinVectorABI) { // Set explicit alignment, since we may need to realign the top. return MinABIStackAlignInBytes; } // Otherwise, if the type contains an SSE vector type, the alignment is 16. if (Align >= 16 && (isSSEVectorType(getContext(), Ty) || isRecordWithSSEVectorType(getContext(), Ty))) return 16; return MinABIStackAlignInBytes; } ABIArgInfo X86_32ABIInfo::getIndirectResult(QualType Ty, bool ByVal, CCState &State) const { if (!ByVal) { if (State.FreeRegs) { --State.FreeRegs; // Non-byval indirects just use one pointer. return ABIArgInfo::getIndirectInReg(0, false); } return ABIArgInfo::getIndirect(0, false); } // Compute the byval alignment. unsigned TypeAlign = getContext().getTypeAlign(Ty) / 8; unsigned StackAlign = getTypeStackAlignInBytes(Ty, TypeAlign); if (StackAlign == 0) return ABIArgInfo::getIndirect(4, /*ByVal=*/true); // If the stack alignment is less than the type alignment, realign the // argument. bool Realign = TypeAlign > StackAlign; return ABIArgInfo::getIndirect(StackAlign, /*ByVal=*/true, Realign); } X86_32ABIInfo::Class X86_32ABIInfo::classify(QualType Ty) const { const Type *T = isSingleElementStruct(Ty, getContext()); if (!T) T = Ty.getTypePtr(); if (const BuiltinType *BT = T->getAs()) { BuiltinType::Kind K = BT->getKind(); if (K == BuiltinType::Float || K == BuiltinType::Double) return Float; } return Integer; } bool X86_32ABIInfo::shouldUseInReg(QualType Ty, CCState &State, bool &NeedsPadding) const { NeedsPadding = false; Class C = classify(Ty); if (C == Float) return false; unsigned Size = getContext().getTypeSize(Ty); unsigned SizeInRegs = (Size + 31) / 32; if (SizeInRegs == 0) return false; if (SizeInRegs > State.FreeRegs) { State.FreeRegs = 0; return false; } State.FreeRegs -= SizeInRegs; if (State.CC == llvm::CallingConv::X86_FastCall || State.CC == llvm::CallingConv::X86_VectorCall) { if (Size > 32) return false; if (Ty->isIntegralOrEnumerationType()) return true; if (Ty->isPointerType()) return true; if (Ty->isReferenceType()) return true; if (State.FreeRegs) NeedsPadding = true; return false; } return true; } ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, CCState &State) const { // FIXME: Set alignment on indirect arguments. Ty = useFirstFieldIfTransparentUnion(Ty); // Check with the C++ ABI first. const RecordType *RT = Ty->getAs(); if (RT) { CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI()); if (RAA == CGCXXABI::RAA_Indirect) { return getIndirectResult(Ty, false, State); } else if (RAA == CGCXXABI::RAA_DirectInMemory) { // The field index doesn't matter, we'll fix it up later. return ABIArgInfo::getInAlloca(/*FieldIndex=*/0); } } // vectorcall adds the concept of a homogenous vector aggregate, similar // to other targets. const Type *Base = nullptr; uint64_t NumElts = 0; if (State.CC == llvm::CallingConv::X86_VectorCall && isHomogeneousAggregate(Ty, Base, NumElts)) { if (State.FreeSSERegs >= NumElts) { State.FreeSSERegs -= NumElts; if (Ty->isBuiltinType() || Ty->isVectorType()) return ABIArgInfo::getDirect(); return ABIArgInfo::getExpand(); } return getIndirectResult(Ty, /*ByVal=*/false, State); } if (isAggregateTypeForABI(Ty)) { if (RT) { // Structs are always byval on win32, regardless of what they contain. if (IsWin32StructABI) return getIndirectResult(Ty, true, State); // Structures with flexible arrays are always indirect. if (RT->getDecl()->hasFlexibleArrayMember()) return getIndirectResult(Ty, true, State); } // Ignore empty structs/unions. if (isEmptyRecord(getContext(), Ty, true)) return ABIArgInfo::getIgnore(); llvm::LLVMContext &LLVMContext = getVMContext(); llvm::IntegerType *Int32 = llvm::Type::getInt32Ty(LLVMContext); bool NeedsPadding; if (shouldUseInReg(Ty, State, NeedsPadding)) { unsigned SizeInRegs = (getContext().getTypeSize(Ty) + 31) / 32; SmallVector Elements(SizeInRegs, Int32); llvm::Type *Result = llvm::StructType::get(LLVMContext, Elements); return ABIArgInfo::getDirectInReg(Result); } llvm::IntegerType *PaddingType = NeedsPadding ? Int32 : nullptr; // Expand small (<= 128-bit) record types when we know that the stack layout // of those arguments will match the struct. This is important because the // LLVM backend isn't smart enough to remove byval, which inhibits many // optimizations. if (getContext().getTypeSize(Ty) <= 4*32 && canExpandIndirectArgument(Ty, getContext())) return ABIArgInfo::getExpandWithPadding( State.CC == llvm::CallingConv::X86_FastCall || State.CC == llvm::CallingConv::X86_VectorCall, PaddingType); return getIndirectResult(Ty, true, State); } if (const VectorType *VT = Ty->getAs()) { // On Darwin, some vectors are passed in memory, we handle this by passing // it as an i8/i16/i32/i64. if (IsDarwinVectorABI) { uint64_t Size = getContext().getTypeSize(Ty); if ((Size == 8 || Size == 16 || Size == 32) || (Size == 64 && VT->getNumElements() == 1)) return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Size)); } if (IsX86_MMXType(CGT.ConvertType(Ty))) return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), 64)); return ABIArgInfo::getDirect(); } if (const EnumType *EnumTy = Ty->getAs()) Ty = EnumTy->getDecl()->getIntegerType(); bool NeedsPadding; bool InReg = shouldUseInReg(Ty, State, NeedsPadding); if (Ty->isPromotableIntegerType()) { if (InReg) return ABIArgInfo::getExtendInReg(); return ABIArgInfo::getExtend(); } if (InReg) return ABIArgInfo::getDirectInReg(); return ABIArgInfo::getDirect(); } void X86_32ABIInfo::computeInfo(CGFunctionInfo &FI) const { CCState State(FI.getCallingConvention()); if (State.CC == llvm::CallingConv::X86_FastCall) State.FreeRegs = 2; else if (State.CC == llvm::CallingConv::X86_VectorCall) { State.FreeRegs = 2; State.FreeSSERegs = 6; } else if (FI.getHasRegParm()) State.FreeRegs = FI.getRegParm(); else State.FreeRegs = DefaultNumRegisterParameters; if (!getCXXABI().classifyReturnType(FI)) { FI.getReturnInfo() = classifyReturnType(FI.getReturnType(), State); } else if (FI.getReturnInfo().isIndirect()) { // The C++ ABI is not aware of register usage, so we have to check if the // return value was sret and put it in a register ourselves if appropriate. if (State.FreeRegs) { --State.FreeRegs; // The sret parameter consumes a register. FI.getReturnInfo().setInReg(true); } } // The chain argument effectively gives us another free register. if (FI.isChainCall()) ++State.FreeRegs; bool UsedInAlloca = false; for (auto &I : FI.arguments()) { I.info = classifyArgumentType(I.type, State); UsedInAlloca |= (I.info.getKind() == ABIArgInfo::InAlloca); } // If we needed to use inalloca for any argument, do a second pass and rewrite // all the memory arguments to use inalloca. if (UsedInAlloca) rewriteWithInAlloca(FI); } void X86_32ABIInfo::addFieldToArgStruct(SmallVector &FrameFields, unsigned &StackOffset, ABIArgInfo &Info, QualType Type) const { assert(StackOffset % 4U == 0 && "unaligned inalloca struct"); Info = ABIArgInfo::getInAlloca(FrameFields.size()); FrameFields.push_back(CGT.ConvertTypeForMem(Type)); StackOffset += getContext().getTypeSizeInChars(Type).getQuantity(); // Insert padding bytes to respect alignment. For x86_32, each argument is 4 // byte aligned. if (StackOffset % 4U) { unsigned OldOffset = StackOffset; StackOffset = llvm::RoundUpToAlignment(StackOffset, 4U); unsigned NumBytes = StackOffset - OldOffset; assert(NumBytes); llvm::Type *Ty = llvm::Type::getInt8Ty(getVMContext()); Ty = llvm::ArrayType::get(Ty, NumBytes); FrameFields.push_back(Ty); } } static bool isArgInAlloca(const ABIArgInfo &Info) { // Leave ignored and inreg arguments alone. switch (Info.getKind()) { case ABIArgInfo::InAlloca: return true; case ABIArgInfo::Indirect: assert(Info.getIndirectByVal()); return true; case ABIArgInfo::Ignore: return false; case ABIArgInfo::Direct: case ABIArgInfo::Extend: case ABIArgInfo::Expand: if (Info.getInReg()) return false; return true; } llvm_unreachable("invalid enum"); } void X86_32ABIInfo::rewriteWithInAlloca(CGFunctionInfo &FI) const { assert(IsWin32StructABI && "inalloca only supported on win32"); // Build a packed struct type for all of the arguments in memory. SmallVector FrameFields; unsigned StackOffset = 0; CGFunctionInfo::arg_iterator I = FI.arg_begin(), E = FI.arg_end(); // Put 'this' into the struct before 'sret', if necessary. bool IsThisCall = FI.getCallingConvention() == llvm::CallingConv::X86_ThisCall; ABIArgInfo &Ret = FI.getReturnInfo(); if (Ret.isIndirect() && Ret.isSRetAfterThis() && !IsThisCall && isArgInAlloca(I->info)) { addFieldToArgStruct(FrameFields, StackOffset, I->info, I->type); ++I; } // Put the sret parameter into the inalloca struct if it's in memory. if (Ret.isIndirect() && !Ret.getInReg()) { CanQualType PtrTy = getContext().getPointerType(FI.getReturnType()); addFieldToArgStruct(FrameFields, StackOffset, Ret, PtrTy); // On Windows, the hidden sret parameter is always returned in eax. Ret.setInAllocaSRet(IsWin32StructABI); } // Skip the 'this' parameter in ecx. if (IsThisCall) ++I; // Put arguments passed in memory into the struct. for (; I != E; ++I) { if (isArgInAlloca(I->info)) addFieldToArgStruct(FrameFields, StackOffset, I->info, I->type); } FI.setArgStruct(llvm::StructType::get(getVMContext(), FrameFields, /*isPacked=*/true)); } llvm::Value *X86_32ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { llvm::Type *BPP = CGF.Int8PtrPtrTy; CGBuilderTy &Builder = CGF.Builder; llvm::Value *VAListAddrAsBPP = Builder.CreateBitCast(VAListAddr, BPP, "ap"); llvm::Value *Addr = Builder.CreateLoad(VAListAddrAsBPP, "ap.cur"); // Compute if the address needs to be aligned unsigned Align = CGF.getContext().getTypeAlignInChars(Ty).getQuantity(); Align = getTypeStackAlignInBytes(Ty, Align); Align = std::max(Align, 4U); if (Align > 4) { // addr = (addr + align - 1) & -align; llvm::Value *Offset = llvm::ConstantInt::get(CGF.Int32Ty, Align - 1); Addr = CGF.Builder.CreateGEP(Addr, Offset); llvm::Value *AsInt = CGF.Builder.CreatePtrToInt(Addr, CGF.Int32Ty); llvm::Value *Mask = llvm::ConstantInt::get(CGF.Int32Ty, -Align); Addr = CGF.Builder.CreateIntToPtr(CGF.Builder.CreateAnd(AsInt, Mask), Addr->getType(), "ap.cur.aligned"); } llvm::Type *PTy = llvm::PointerType::getUnqual(CGF.ConvertType(Ty)); llvm::Value *AddrTyped = Builder.CreateBitCast(Addr, PTy); uint64_t Offset = llvm::RoundUpToAlignment(CGF.getContext().getTypeSize(Ty) / 8, Align); llvm::Value *NextAddr = Builder.CreateGEP(Addr, llvm::ConstantInt::get(CGF.Int32Ty, Offset), "ap.next"); Builder.CreateStore(NextAddr, VAListAddrAsBPP); return AddrTyped; } bool X86_32TargetCodeGenInfo::isStructReturnInRegABI( const llvm::Triple &Triple, const CodeGenOptions &Opts) { assert(Triple.getArch() == llvm::Triple::x86); switch (Opts.getStructReturnConvention()) { case CodeGenOptions::SRCK_Default: break; case CodeGenOptions::SRCK_OnStack: // -fpcc-struct-return return false; case CodeGenOptions::SRCK_InRegs: // -freg-struct-return return true; } if (Triple.isOSDarwin()) return true; switch (Triple.getOS()) { case llvm::Triple::DragonFly: case llvm::Triple::FreeBSD: case llvm::Triple::OpenBSD: case llvm::Triple::Bitrig: case llvm::Triple::Win32: return true; default: return false; } } void X86_32TargetCodeGenInfo::SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &CGM) const { if (const FunctionDecl *FD = dyn_cast(D)) { if (FD->hasAttr()) { // Get the LLVM function. llvm::Function *Fn = cast(GV); // Now add the 'alignstack' attribute with a value of 16. llvm::AttrBuilder B; B.addStackAlignmentAttr(16); Fn->addAttributes(llvm::AttributeSet::FunctionIndex, llvm::AttributeSet::get(CGM.getLLVMContext(), llvm::AttributeSet::FunctionIndex, B)); } } } bool X86_32TargetCodeGenInfo::initDwarfEHRegSizeTable( CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const { CodeGen::CGBuilderTy &Builder = CGF.Builder; llvm::Value *Four8 = llvm::ConstantInt::get(CGF.Int8Ty, 4); // 0-7 are the eight integer registers; the order is different // on Darwin (for EH), but the range is the same. // 8 is %eip. AssignToArrayRange(Builder, Address, Four8, 0, 8); if (CGF.CGM.getTarget().getTriple().isOSDarwin()) { // 12-16 are st(0..4). Not sure why we stop at 4. // These have size 16, which is sizeof(long double) on // platforms with 8-byte alignment for that type. llvm::Value *Sixteen8 = llvm::ConstantInt::get(CGF.Int8Ty, 16); AssignToArrayRange(Builder, Address, Sixteen8, 12, 16); } else { // 9 is %eflags, which doesn't get a size on Darwin for some // reason. Builder.CreateStore(Four8, Builder.CreateConstInBoundsGEP1_32(Address, 9)); // 11-16 are st(0..5). Not sure why we stop at 5. // These have size 12, which is sizeof(long double) on // platforms with 4-byte alignment for that type. llvm::Value *Twelve8 = llvm::ConstantInt::get(CGF.Int8Ty, 12); AssignToArrayRange(Builder, Address, Twelve8, 11, 16); } return false; } //===----------------------------------------------------------------------===// // X86-64 ABI Implementation //===----------------------------------------------------------------------===// namespace { /// X86_64ABIInfo - The X86_64 ABI information. class X86_64ABIInfo : public ABIInfo { enum Class { Integer = 0, SSE, SSEUp, X87, X87Up, ComplexX87, NoClass, Memory }; /// merge - Implement the X86_64 ABI merging algorithm. /// /// Merge an accumulating classification \arg Accum with a field /// classification \arg Field. /// /// \param Accum - The accumulating classification. This should /// always be either NoClass or the result of a previous merge /// call. In addition, this should never be Memory (the caller /// should just return Memory for the aggregate). static Class merge(Class Accum, Class Field); /// postMerge - Implement the X86_64 ABI post merging algorithm. /// /// Post merger cleanup, reduces a malformed Hi and Lo pair to /// final MEMORY or SSE classes when necessary. /// /// \param AggregateSize - The size of the current aggregate in /// the classification process. /// /// \param Lo - The classification for the parts of the type /// residing in the low word of the containing object. /// /// \param Hi - The classification for the parts of the type /// residing in the higher words of the containing object. /// void postMerge(unsigned AggregateSize, Class &Lo, Class &Hi) const; /// classify - Determine the x86_64 register classes in which the /// given type T should be passed. /// /// \param Lo - The classification for the parts of the type /// residing in the low word of the containing object. /// /// \param Hi - The classification for the parts of the type /// residing in the high word of the containing object. /// /// \param OffsetBase - The bit offset of this type in the /// containing object. Some parameters are classified different /// depending on whether they straddle an eightbyte boundary. /// /// \param isNamedArg - Whether the argument in question is a "named" /// argument, as used in AMD64-ABI 3.5.7. /// /// If a word is unused its result will be NoClass; if a type should /// be passed in Memory then at least the classification of \arg Lo /// will be Memory. /// /// The \arg Lo class will be NoClass iff the argument is ignored. /// /// If the \arg Lo class is ComplexX87, then the \arg Hi class will /// also be ComplexX87. void classify(QualType T, uint64_t OffsetBase, Class &Lo, Class &Hi, bool isNamedArg) const; llvm::Type *GetByteVectorType(QualType Ty) const; llvm::Type *GetSSETypeAtOffset(llvm::Type *IRType, unsigned IROffset, QualType SourceTy, unsigned SourceOffset) const; llvm::Type *GetINTEGERTypeAtOffset(llvm::Type *IRType, unsigned IROffset, QualType SourceTy, unsigned SourceOffset) const; /// getIndirectResult - Give a source type \arg Ty, return a suitable result /// such that the argument will be returned in memory. ABIArgInfo getIndirectReturnResult(QualType Ty) const; /// getIndirectResult - Give a source type \arg Ty, return a suitable result /// such that the argument will be passed in memory. /// /// \param freeIntRegs - The number of free integer registers remaining /// available. ABIArgInfo getIndirectResult(QualType Ty, unsigned freeIntRegs) const; ABIArgInfo classifyReturnType(QualType RetTy) const; ABIArgInfo classifyArgumentType(QualType Ty, unsigned freeIntRegs, unsigned &neededInt, unsigned &neededSSE, bool isNamedArg) const; bool IsIllegalVectorType(QualType Ty) const; /// The 0.98 ABI revision clarified a lot of ambiguities, /// unfortunately in ways that were not always consistent with /// certain previous compilers. In particular, platforms which /// required strict binary compatibility with older versions of GCC /// may need to exempt themselves. bool honorsRevision0_98() const { return !getTarget().getTriple().isOSDarwin(); } bool HasAVX; // Some ABIs (e.g. X32 ABI and Native Client OS) use 32 bit pointers on // 64-bit hardware. bool Has64BitPointers; public: X86_64ABIInfo(CodeGen::CodeGenTypes &CGT, bool hasavx) : ABIInfo(CGT), HasAVX(hasavx), Has64BitPointers(CGT.getDataLayout().getPointerSize(0) == 8) { } bool isPassedUsingAVXType(QualType type) const { unsigned neededInt, neededSSE; // The freeIntRegs argument doesn't matter here. ABIArgInfo info = classifyArgumentType(type, 0, neededInt, neededSSE, /*isNamedArg*/true); if (info.isDirect()) { llvm::Type *ty = info.getCoerceToType(); if (llvm::VectorType *vectorTy = dyn_cast_or_null(ty)) return (vectorTy->getBitWidth() > 128); } return false; } void computeInfo(CGFunctionInfo &FI) const override; llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; }; /// WinX86_64ABIInfo - The Windows X86_64 ABI information. class WinX86_64ABIInfo : public ABIInfo { ABIArgInfo classify(QualType Ty, unsigned &FreeSSERegs, bool IsReturnType) const; public: WinX86_64ABIInfo(CodeGen::CodeGenTypes &CGT) : ABIInfo(CGT) {} void computeInfo(CGFunctionInfo &FI) const override; llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; bool isHomogeneousAggregateBaseType(QualType Ty) const override { // FIXME: Assumes vectorcall is in use. return isX86VectorTypeForVectorCall(getContext(), Ty); } bool isHomogeneousAggregateSmallEnough(const Type *Ty, uint64_t NumMembers) const override { // FIXME: Assumes vectorcall is in use. return isX86VectorCallAggregateSmallEnough(NumMembers); } }; class X86_64TargetCodeGenInfo : public TargetCodeGenInfo { bool HasAVX; public: X86_64TargetCodeGenInfo(CodeGen::CodeGenTypes &CGT, bool HasAVX) : TargetCodeGenInfo(new X86_64ABIInfo(CGT, HasAVX)), HasAVX(HasAVX) {} const X86_64ABIInfo &getABIInfo() const { return static_cast(TargetCodeGenInfo::getABIInfo()); } int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const override { return 7; } bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const override { llvm::Value *Eight8 = llvm::ConstantInt::get(CGF.Int8Ty, 8); // 0-15 are the 16 integer registers. // 16 is %rip. AssignToArrayRange(CGF.Builder, Address, Eight8, 0, 16); return false; } llvm::Type* adjustInlineAsmType(CodeGen::CodeGenFunction &CGF, StringRef Constraint, llvm::Type* Ty) const override { return X86AdjustInlineAsmType(CGF, Constraint, Ty); } bool isNoProtoCallVariadic(const CallArgList &args, const FunctionNoProtoType *fnType) const override { // The default CC on x86-64 sets %al to the number of SSA // registers used, and GCC sets this when calling an unprototyped // function, so we override the default behavior. However, don't do // that when AVX types are involved: the ABI explicitly states it is // undefined, and it doesn't work in practice because of how the ABI // defines varargs anyway. if (fnType->getCallConv() == CC_C) { bool HasAVXType = false; for (CallArgList::const_iterator it = args.begin(), ie = args.end(); it != ie; ++it) { if (getABIInfo().isPassedUsingAVXType(it->Ty)) { HasAVXType = true; break; } } if (!HasAVXType) return true; } return TargetCodeGenInfo::isNoProtoCallVariadic(args, fnType); } llvm::Constant * getUBSanFunctionSignature(CodeGen::CodeGenModule &CGM) const override { unsigned Sig = (0xeb << 0) | // jmp rel8 (0x0a << 8) | // .+0x0c ('F' << 16) | ('T' << 24); return llvm::ConstantInt::get(CGM.Int32Ty, Sig); } unsigned getOpenMPSimdDefaultAlignment(QualType) const override { return HasAVX ? 32 : 16; } }; static std::string qualifyWindowsLibrary(llvm::StringRef Lib) { // If the argument does not end in .lib, automatically add the suffix. This // matches the behavior of MSVC. std::string ArgStr = Lib; if (!Lib.endswith_lower(".lib")) ArgStr += ".lib"; return ArgStr; } class WinX86_32TargetCodeGenInfo : public X86_32TargetCodeGenInfo { public: WinX86_32TargetCodeGenInfo(CodeGen::CodeGenTypes &CGT, bool d, bool p, bool w, unsigned RegParms) : X86_32TargetCodeGenInfo(CGT, d, p, w, RegParms) {} void getDependentLibraryOption(llvm::StringRef Lib, llvm::SmallString<24> &Opt) const override { Opt = "/DEFAULTLIB:"; Opt += qualifyWindowsLibrary(Lib); } void getDetectMismatchOption(llvm::StringRef Name, llvm::StringRef Value, llvm::SmallString<32> &Opt) const override { Opt = "/FAILIFMISMATCH:\"" + Name.str() + "=" + Value.str() + "\""; } }; class WinX86_64TargetCodeGenInfo : public TargetCodeGenInfo { bool HasAVX; public: WinX86_64TargetCodeGenInfo(CodeGen::CodeGenTypes &CGT, bool HasAVX) : TargetCodeGenInfo(new WinX86_64ABIInfo(CGT)), HasAVX(HasAVX) {} int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const override { return 7; } bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const override { llvm::Value *Eight8 = llvm::ConstantInt::get(CGF.Int8Ty, 8); // 0-15 are the 16 integer registers. // 16 is %rip. AssignToArrayRange(CGF.Builder, Address, Eight8, 0, 16); return false; } void getDependentLibraryOption(llvm::StringRef Lib, llvm::SmallString<24> &Opt) const override { Opt = "/DEFAULTLIB:"; Opt += qualifyWindowsLibrary(Lib); } void getDetectMismatchOption(llvm::StringRef Name, llvm::StringRef Value, llvm::SmallString<32> &Opt) const override { Opt = "/FAILIFMISMATCH:\"" + Name.str() + "=" + Value.str() + "\""; } unsigned getOpenMPSimdDefaultAlignment(QualType) const override { return HasAVX ? 32 : 16; } }; } void X86_64ABIInfo::postMerge(unsigned AggregateSize, Class &Lo, Class &Hi) const { // AMD64-ABI 3.2.3p2: Rule 5. Then a post merger cleanup is done: // // (a) If one of the classes is Memory, the whole argument is passed in // memory. // // (b) If X87UP is not preceded by X87, the whole argument is passed in // memory. // // (c) If the size of the aggregate exceeds two eightbytes and the first // eightbyte isn't SSE or any other eightbyte isn't SSEUP, the whole // argument is passed in memory. NOTE: This is necessary to keep the // ABI working for processors that don't support the __m256 type. // // (d) If SSEUP is not preceded by SSE or SSEUP, it is converted to SSE. // // Some of these are enforced by the merging logic. Others can arise // only with unions; for example: // union { _Complex double; unsigned; } // // Note that clauses (b) and (c) were added in 0.98. // if (Hi == Memory) Lo = Memory; if (Hi == X87Up && Lo != X87 && honorsRevision0_98()) Lo = Memory; if (AggregateSize > 128 && (Lo != SSE || Hi != SSEUp)) Lo = Memory; if (Hi == SSEUp && Lo != SSE) Hi = SSE; } X86_64ABIInfo::Class X86_64ABIInfo::merge(Class Accum, Class Field) { // AMD64-ABI 3.2.3p2: Rule 4. Each field of an object is // classified recursively so that always two fields are // considered. The resulting class is calculated according to // the classes of the fields in the eightbyte: // // (a) If both classes are equal, this is the resulting class. // // (b) If one of the classes is NO_CLASS, the resulting class is // the other class. // // (c) If one of the classes is MEMORY, the result is the MEMORY // class. // // (d) If one of the classes is INTEGER, the result is the // INTEGER. // // (e) If one of the classes is X87, X87UP, COMPLEX_X87 class, // MEMORY is used as class. // // (f) Otherwise class SSE is used. // Accum should never be memory (we should have returned) or // ComplexX87 (because this cannot be passed in a structure). assert((Accum != Memory && Accum != ComplexX87) && "Invalid accumulated classification during merge."); if (Accum == Field || Field == NoClass) return Accum; if (Field == Memory) return Memory; if (Accum == NoClass) return Field; if (Accum == Integer || Field == Integer) return Integer; if (Field == X87 || Field == X87Up || Field == ComplexX87 || Accum == X87 || Accum == X87Up) return Memory; return SSE; } void X86_64ABIInfo::classify(QualType Ty, uint64_t OffsetBase, Class &Lo, Class &Hi, bool isNamedArg) const { // FIXME: This code can be simplified by introducing a simple value class for // Class pairs with appropriate constructor methods for the various // situations. // FIXME: Some of the split computations are wrong; unaligned vectors // shouldn't be passed in registers for example, so there is no chance they // can straddle an eightbyte. Verify & simplify. Lo = Hi = NoClass; Class &Current = OffsetBase < 64 ? Lo : Hi; Current = Memory; if (const BuiltinType *BT = Ty->getAs()) { BuiltinType::Kind k = BT->getKind(); if (k == BuiltinType::Void) { Current = NoClass; } else if (k == BuiltinType::Int128 || k == BuiltinType::UInt128) { Lo = Integer; Hi = Integer; } else if (k >= BuiltinType::Bool && k <= BuiltinType::LongLong) { Current = Integer; } else if ((k == BuiltinType::Float || k == BuiltinType::Double) || (k == BuiltinType::LongDouble && getTarget().getTriple().isOSNaCl())) { Current = SSE; } else if (k == BuiltinType::LongDouble) { Lo = X87; Hi = X87Up; } // FIXME: _Decimal32 and _Decimal64 are SSE. // FIXME: _float128 and _Decimal128 are (SSE, SSEUp). return; } if (const EnumType *ET = Ty->getAs()) { // Classify the underlying integer type. classify(ET->getDecl()->getIntegerType(), OffsetBase, Lo, Hi, isNamedArg); return; } if (Ty->hasPointerRepresentation()) { Current = Integer; return; } if (Ty->isMemberPointerType()) { if (Ty->isMemberFunctionPointerType()) { if (Has64BitPointers) { // If Has64BitPointers, this is an {i64, i64}, so classify both // Lo and Hi now. Lo = Hi = Integer; } else { // Otherwise, with 32-bit pointers, this is an {i32, i32}. If that // straddles an eightbyte boundary, Hi should be classified as well. uint64_t EB_FuncPtr = (OffsetBase) / 64; uint64_t EB_ThisAdj = (OffsetBase + 64 - 1) / 64; if (EB_FuncPtr != EB_ThisAdj) { Lo = Hi = Integer; } else { Current = Integer; } } } else { Current = Integer; } return; } if (const VectorType *VT = Ty->getAs()) { uint64_t Size = getContext().getTypeSize(VT); if (Size == 32) { // gcc passes all <4 x char>, <2 x short>, <1 x int>, <1 x // float> as integer. Current = Integer; // If this type crosses an eightbyte boundary, it should be // split. uint64_t EB_Real = (OffsetBase) / 64; uint64_t EB_Imag = (OffsetBase + Size - 1) / 64; if (EB_Real != EB_Imag) Hi = Lo; } else if (Size == 64) { // gcc passes <1 x double> in memory. :( if (VT->getElementType()->isSpecificBuiltinType(BuiltinType::Double)) return; // gcc passes <1 x long long> as INTEGER. if (VT->getElementType()->isSpecificBuiltinType(BuiltinType::LongLong) || VT->getElementType()->isSpecificBuiltinType(BuiltinType::ULongLong) || VT->getElementType()->isSpecificBuiltinType(BuiltinType::Long) || VT->getElementType()->isSpecificBuiltinType(BuiltinType::ULong)) Current = Integer; else Current = SSE; // If this type crosses an eightbyte boundary, it should be // split. if (OffsetBase && OffsetBase != 64) Hi = Lo; } else if (Size == 128 || (HasAVX && isNamedArg && Size == 256)) { // Arguments of 256-bits are split into four eightbyte chunks. The // least significant one belongs to class SSE and all the others to class // SSEUP. The original Lo and Hi design considers that types can't be // greater than 128-bits, so a 64-bit split in Hi and Lo makes sense. // This design isn't correct for 256-bits, but since there're no cases // where the upper parts would need to be inspected, avoid adding // complexity and just consider Hi to match the 64-256 part. // // Note that per 3.5.7 of AMD64-ABI, 256-bit args are only passed in // registers if they are "named", i.e. not part of the "..." of a // variadic function. Lo = SSE; Hi = SSEUp; } return; } if (const ComplexType *CT = Ty->getAs()) { QualType ET = getContext().getCanonicalType(CT->getElementType()); uint64_t Size = getContext().getTypeSize(Ty); if (ET->isIntegralOrEnumerationType()) { if (Size <= 64) Current = Integer; else if (Size <= 128) Lo = Hi = Integer; } else if (ET == getContext().FloatTy) Current = SSE; else if (ET == getContext().DoubleTy || (ET == getContext().LongDoubleTy && getTarget().getTriple().isOSNaCl())) Lo = Hi = SSE; else if (ET == getContext().LongDoubleTy) Current = ComplexX87; // If this complex type crosses an eightbyte boundary then it // should be split. uint64_t EB_Real = (OffsetBase) / 64; uint64_t EB_Imag = (OffsetBase + getContext().getTypeSize(ET)) / 64; if (Hi == NoClass && EB_Real != EB_Imag) Hi = Lo; return; } if (const ConstantArrayType *AT = getContext().getAsConstantArrayType(Ty)) { // Arrays are treated like structures. uint64_t Size = getContext().getTypeSize(Ty); // AMD64-ABI 3.2.3p2: Rule 1. If the size of an object is larger // than four eightbytes, ..., it has class MEMORY. if (Size > 256) return; // AMD64-ABI 3.2.3p2: Rule 1. If ..., or it contains unaligned // fields, it has class MEMORY. // // Only need to check alignment of array base. if (OffsetBase % getContext().getTypeAlign(AT->getElementType())) return; // Otherwise implement simplified merge. We could be smarter about // this, but it isn't worth it and would be harder to verify. Current = NoClass; uint64_t EltSize = getContext().getTypeSize(AT->getElementType()); uint64_t ArraySize = AT->getSize().getZExtValue(); // The only case a 256-bit wide vector could be used is when the array // contains a single 256-bit element. Since Lo and Hi logic isn't extended // to work for sizes wider than 128, early check and fallback to memory. if (Size > 128 && EltSize != 256) return; for (uint64_t i=0, Offset=OffsetBase; igetElementType(), Offset, FieldLo, FieldHi, isNamedArg); Lo = merge(Lo, FieldLo); Hi = merge(Hi, FieldHi); if (Lo == Memory || Hi == Memory) break; } postMerge(Size, Lo, Hi); assert((Hi != SSEUp || Lo == SSE) && "Invalid SSEUp array classification."); return; } if (const RecordType *RT = Ty->getAs()) { uint64_t Size = getContext().getTypeSize(Ty); // AMD64-ABI 3.2.3p2: Rule 1. If the size of an object is larger // than four eightbytes, ..., it has class MEMORY. if (Size > 256) return; // AMD64-ABI 3.2.3p2: Rule 2. If a C++ object has either a non-trivial // copy constructor or a non-trivial destructor, it is passed by invisible // reference. if (getRecordArgABI(RT, getCXXABI())) return; const RecordDecl *RD = RT->getDecl(); // Assume variable sized types are passed in memory. if (RD->hasFlexibleArrayMember()) return; const ASTRecordLayout &Layout = getContext().getASTRecordLayout(RD); // Reset Lo class, this will be recomputed. Current = NoClass; // If this is a C++ record, classify the bases first. if (const CXXRecordDecl *CXXRD = dyn_cast(RD)) { for (const auto &I : CXXRD->bases()) { assert(!I.isVirtual() && !I.getType()->isDependentType() && "Unexpected base class!"); const CXXRecordDecl *Base = cast(I.getType()->getAs()->getDecl()); // Classify this field. // // AMD64-ABI 3.2.3p2: Rule 3. If the size of the aggregate exceeds a // single eightbyte, each is classified separately. Each eightbyte gets // initialized to class NO_CLASS. Class FieldLo, FieldHi; uint64_t Offset = OffsetBase + getContext().toBits(Layout.getBaseClassOffset(Base)); classify(I.getType(), Offset, FieldLo, FieldHi, isNamedArg); Lo = merge(Lo, FieldLo); Hi = merge(Hi, FieldHi); if (Lo == Memory || Hi == Memory) break; } } // Classify the fields one at a time, merging the results. unsigned idx = 0; for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end(); i != e; ++i, ++idx) { uint64_t Offset = OffsetBase + Layout.getFieldOffset(idx); bool BitField = i->isBitField(); // AMD64-ABI 3.2.3p2: Rule 1. If the size of an object is larger than // four eightbytes, or it contains unaligned fields, it has class MEMORY. // // The only case a 256-bit wide vector could be used is when the struct // contains a single 256-bit element. Since Lo and Hi logic isn't extended // to work for sizes wider than 128, early check and fallback to memory. // if (Size > 128 && getContext().getTypeSize(i->getType()) != 256) { Lo = Memory; return; } // Note, skip this test for bit-fields, see below. if (!BitField && Offset % getContext().getTypeAlign(i->getType())) { Lo = Memory; return; } // Classify this field. // // AMD64-ABI 3.2.3p2: Rule 3. If the size of the aggregate // exceeds a single eightbyte, each is classified // separately. Each eightbyte gets initialized to class // NO_CLASS. Class FieldLo, FieldHi; // Bit-fields require special handling, they do not force the // structure to be passed in memory even if unaligned, and // therefore they can straddle an eightbyte. if (BitField) { // Ignore padding bit-fields. if (i->isUnnamedBitfield()) continue; uint64_t Offset = OffsetBase + Layout.getFieldOffset(idx); uint64_t Size = i->getBitWidthValue(getContext()); uint64_t EB_Lo = Offset / 64; uint64_t EB_Hi = (Offset + Size - 1) / 64; if (EB_Lo) { assert(EB_Hi == EB_Lo && "Invalid classification, type > 16 bytes."); FieldLo = NoClass; FieldHi = Integer; } else { FieldLo = Integer; FieldHi = EB_Hi ? Integer : NoClass; } } else classify(i->getType(), Offset, FieldLo, FieldHi, isNamedArg); Lo = merge(Lo, FieldLo); Hi = merge(Hi, FieldHi); if (Lo == Memory || Hi == Memory) break; } postMerge(Size, Lo, Hi); } } ABIArgInfo X86_64ABIInfo::getIndirectReturnResult(QualType Ty) const { // If this is a scalar LLVM value then assume LLVM will pass it in the right // place naturally. if (!isAggregateTypeForABI(Ty)) { // Treat an enum type as its underlying type. if (const EnumType *EnumTy = Ty->getAs()) Ty = EnumTy->getDecl()->getIntegerType(); return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } return ABIArgInfo::getIndirect(0); } bool X86_64ABIInfo::IsIllegalVectorType(QualType Ty) const { if (const VectorType *VecTy = Ty->getAs()) { uint64_t Size = getContext().getTypeSize(VecTy); unsigned LargestVector = HasAVX ? 256 : 128; if (Size <= 64 || Size > LargestVector) return true; } return false; } ABIArgInfo X86_64ABIInfo::getIndirectResult(QualType Ty, unsigned freeIntRegs) const { // If this is a scalar LLVM value then assume LLVM will pass it in the right // place naturally. // // This assumption is optimistic, as there could be free registers available // when we need to pass this argument in memory, and LLVM could try to pass // the argument in the free register. This does not seem to happen currently, // but this code would be much safer if we could mark the argument with // 'onstack'. See PR12193. if (!isAggregateTypeForABI(Ty) && !IsIllegalVectorType(Ty)) { // Treat an enum type as its underlying type. if (const EnumType *EnumTy = Ty->getAs()) Ty = EnumTy->getDecl()->getIntegerType(); return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) return ABIArgInfo::getIndirect(0, RAA == CGCXXABI::RAA_DirectInMemory); // Compute the byval alignment. We specify the alignment of the byval in all // cases so that the mid-level optimizer knows the alignment of the byval. unsigned Align = std::max(getContext().getTypeAlign(Ty) / 8, 8U); // Attempt to avoid passing indirect results using byval when possible. This // is important for good codegen. // // We do this by coercing the value into a scalar type which the backend can // handle naturally (i.e., without using byval). // // For simplicity, we currently only do this when we have exhausted all of the // free integer registers. Doing this when there are free integer registers // would require more care, as we would have to ensure that the coerced value // did not claim the unused register. That would require either reording the // arguments to the function (so that any subsequent inreg values came first), // or only doing this optimization when there were no following arguments that // might be inreg. // // We currently expect it to be rare (particularly in well written code) for // arguments to be passed on the stack when there are still free integer // registers available (this would typically imply large structs being passed // by value), so this seems like a fair tradeoff for now. // // We can revisit this if the backend grows support for 'onstack' parameter // attributes. See PR12193. if (freeIntRegs == 0) { uint64_t Size = getContext().getTypeSize(Ty); // If this type fits in an eightbyte, coerce it into the matching integral // type, which will end up on the stack (with alignment 8). if (Align == 8 && Size <= 64) return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Size)); } return ABIArgInfo::getIndirect(Align); } -/// GetByteVectorType - The ABI specifies that a value should be passed in an -/// full vector XMM/YMM register. Pick an LLVM IR type that will be passed as a -/// vector register. +/// The ABI specifies that a value should be passed in a full vector XMM/YMM +/// register. Pick an LLVM IR type that will be passed as a vector register. llvm::Type *X86_64ABIInfo::GetByteVectorType(QualType Ty) const { - llvm::Type *IRType = CGT.ConvertType(Ty); + // Wrapper structs/arrays that only contain vectors are passed just like + // vectors; strip them off if present. + if (const Type *InnerTy = isSingleElementStruct(Ty, getContext())) + Ty = QualType(InnerTy, 0); - // Wrapper structs that just contain vectors are passed just like vectors, - // strip them off if present. - llvm::StructType *STy = dyn_cast(IRType); - while (STy && STy->getNumElements() == 1) { - IRType = STy->getElementType(0); - STy = dyn_cast(IRType); - } + llvm::Type *IRType = CGT.ConvertType(Ty); // If the preferred type is a 16-byte vector, prefer to pass it. if (llvm::VectorType *VT = dyn_cast(IRType)){ llvm::Type *EltTy = VT->getElementType(); unsigned BitWidth = VT->getBitWidth(); if ((BitWidth >= 128 && BitWidth <= 256) && (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(8) || EltTy->isIntegerTy(16) || EltTy->isIntegerTy(32) || EltTy->isIntegerTy(64) || EltTy->isIntegerTy(128))) return VT; } return llvm::VectorType::get(llvm::Type::getDoubleTy(getVMContext()), 2); } /// BitsContainNoUserData - Return true if the specified [start,end) bit range /// is known to either be off the end of the specified type or being in /// alignment padding. The user type specified is known to be at most 128 bits /// in size, and have passed through X86_64ABIInfo::classify with a successful /// classification that put one of the two halves in the INTEGER class. /// /// It is conservatively correct to return false. static bool BitsContainNoUserData(QualType Ty, unsigned StartBit, unsigned EndBit, ASTContext &Context) { // If the bytes being queried are off the end of the type, there is no user // data hiding here. This handles analysis of builtins, vectors and other // types that don't contain interesting padding. unsigned TySize = (unsigned)Context.getTypeSize(Ty); if (TySize <= StartBit) return true; if (const ConstantArrayType *AT = Context.getAsConstantArrayType(Ty)) { unsigned EltSize = (unsigned)Context.getTypeSize(AT->getElementType()); unsigned NumElts = (unsigned)AT->getSize().getZExtValue(); // Check each element to see if the element overlaps with the queried range. for (unsigned i = 0; i != NumElts; ++i) { // If the element is after the span we care about, then we're done.. unsigned EltOffset = i*EltSize; if (EltOffset >= EndBit) break; unsigned EltStart = EltOffset < StartBit ? StartBit-EltOffset :0; if (!BitsContainNoUserData(AT->getElementType(), EltStart, EndBit-EltOffset, Context)) return false; } // If it overlaps no elements, then it is safe to process as padding. return true; } if (const RecordType *RT = Ty->getAs()) { const RecordDecl *RD = RT->getDecl(); const ASTRecordLayout &Layout = Context.getASTRecordLayout(RD); // If this is a C++ record, check the bases first. if (const CXXRecordDecl *CXXRD = dyn_cast(RD)) { for (const auto &I : CXXRD->bases()) { assert(!I.isVirtual() && !I.getType()->isDependentType() && "Unexpected base class!"); const CXXRecordDecl *Base = cast(I.getType()->getAs()->getDecl()); // If the base is after the span we care about, ignore it. unsigned BaseOffset = Context.toBits(Layout.getBaseClassOffset(Base)); if (BaseOffset >= EndBit) continue; unsigned BaseStart = BaseOffset < StartBit ? StartBit-BaseOffset :0; if (!BitsContainNoUserData(I.getType(), BaseStart, EndBit-BaseOffset, Context)) return false; } } // Verify that no field has data that overlaps the region of interest. Yes // this could be sped up a lot by being smarter about queried fields, // however we're only looking at structs up to 16 bytes, so we don't care // much. unsigned idx = 0; for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end(); i != e; ++i, ++idx) { unsigned FieldOffset = (unsigned)Layout.getFieldOffset(idx); // If we found a field after the region we care about, then we're done. if (FieldOffset >= EndBit) break; unsigned FieldStart = FieldOffset < StartBit ? StartBit-FieldOffset :0; if (!BitsContainNoUserData(i->getType(), FieldStart, EndBit-FieldOffset, Context)) return false; } // If nothing in this record overlapped the area of interest, then we're // clean. return true; } return false; } /// ContainsFloatAtOffset - Return true if the specified LLVM IR type has a /// float member at the specified offset. For example, {int,{float}} has a /// float at offset 4. It is conservatively correct for this routine to return /// false. static bool ContainsFloatAtOffset(llvm::Type *IRType, unsigned IROffset, const llvm::DataLayout &TD) { // Base case if we find a float. if (IROffset == 0 && IRType->isFloatTy()) return true; // If this is a struct, recurse into the field at the specified offset. if (llvm::StructType *STy = dyn_cast(IRType)) { const llvm::StructLayout *SL = TD.getStructLayout(STy); unsigned Elt = SL->getElementContainingOffset(IROffset); IROffset -= SL->getElementOffset(Elt); return ContainsFloatAtOffset(STy->getElementType(Elt), IROffset, TD); } // If this is an array, recurse into the field at the specified offset. if (llvm::ArrayType *ATy = dyn_cast(IRType)) { llvm::Type *EltTy = ATy->getElementType(); unsigned EltSize = TD.getTypeAllocSize(EltTy); IROffset -= IROffset/EltSize*EltSize; return ContainsFloatAtOffset(EltTy, IROffset, TD); } return false; } /// GetSSETypeAtOffset - Return a type that will be passed by the backend in the /// low 8 bytes of an XMM register, corresponding to the SSE class. llvm::Type *X86_64ABIInfo:: GetSSETypeAtOffset(llvm::Type *IRType, unsigned IROffset, QualType SourceTy, unsigned SourceOffset) const { // The only three choices we have are either double, <2 x float>, or float. We // pass as float if the last 4 bytes is just padding. This happens for // structs that contain 3 floats. if (BitsContainNoUserData(SourceTy, SourceOffset*8+32, SourceOffset*8+64, getContext())) return llvm::Type::getFloatTy(getVMContext()); // We want to pass as <2 x float> if the LLVM IR type contains a float at // offset+0 and offset+4. Walk the LLVM IR type to find out if this is the // case. if (ContainsFloatAtOffset(IRType, IROffset, getDataLayout()) && ContainsFloatAtOffset(IRType, IROffset+4, getDataLayout())) return llvm::VectorType::get(llvm::Type::getFloatTy(getVMContext()), 2); return llvm::Type::getDoubleTy(getVMContext()); } /// GetINTEGERTypeAtOffset - The ABI specifies that a value should be passed in /// an 8-byte GPR. This means that we either have a scalar or we are talking /// about the high or low part of an up-to-16-byte struct. This routine picks /// the best LLVM IR type to represent this, which may be i64 or may be anything /// else that the backend will pass in a GPR that works better (e.g. i8, %foo*, /// etc). /// /// PrefType is an LLVM IR type that corresponds to (part of) the IR type for /// the source type. IROffset is an offset in bytes into the LLVM IR type that /// the 8-byte value references. PrefType may be null. /// /// SourceTy is the source-level type for the entire argument. SourceOffset is /// an offset into this that we're processing (which is always either 0 or 8). /// llvm::Type *X86_64ABIInfo:: GetINTEGERTypeAtOffset(llvm::Type *IRType, unsigned IROffset, QualType SourceTy, unsigned SourceOffset) const { // If we're dealing with an un-offset LLVM IR type, then it means that we're // returning an 8-byte unit starting with it. See if we can safely use it. if (IROffset == 0) { // Pointers and int64's always fill the 8-byte unit. if ((isa(IRType) && Has64BitPointers) || IRType->isIntegerTy(64)) return IRType; // If we have a 1/2/4-byte integer, we can use it only if the rest of the // goodness in the source type is just tail padding. This is allowed to // kick in for struct {double,int} on the int, but not on // struct{double,int,int} because we wouldn't return the second int. We // have to do this analysis on the source type because we can't depend on // unions being lowered a specific way etc. if (IRType->isIntegerTy(8) || IRType->isIntegerTy(16) || IRType->isIntegerTy(32) || (isa(IRType) && !Has64BitPointers)) { unsigned BitWidth = isa(IRType) ? 32 : cast(IRType)->getBitWidth(); if (BitsContainNoUserData(SourceTy, SourceOffset*8+BitWidth, SourceOffset*8+64, getContext())) return IRType; } } if (llvm::StructType *STy = dyn_cast(IRType)) { // If this is a struct, recurse into the field at the specified offset. const llvm::StructLayout *SL = getDataLayout().getStructLayout(STy); if (IROffset < SL->getSizeInBytes()) { unsigned FieldIdx = SL->getElementContainingOffset(IROffset); IROffset -= SL->getElementOffset(FieldIdx); return GetINTEGERTypeAtOffset(STy->getElementType(FieldIdx), IROffset, SourceTy, SourceOffset); } } if (llvm::ArrayType *ATy = dyn_cast(IRType)) { llvm::Type *EltTy = ATy->getElementType(); unsigned EltSize = getDataLayout().getTypeAllocSize(EltTy); unsigned EltOffset = IROffset/EltSize*EltSize; return GetINTEGERTypeAtOffset(EltTy, IROffset-EltOffset, SourceTy, SourceOffset); } // Okay, we don't have any better idea of what to pass, so we pass this in an // integer register that isn't too big to fit the rest of the struct. unsigned TySizeInBytes = (unsigned)getContext().getTypeSizeInChars(SourceTy).getQuantity(); assert(TySizeInBytes != SourceOffset && "Empty field?"); // It is always safe to classify this as an integer type up to i64 that // isn't larger than the structure. return llvm::IntegerType::get(getVMContext(), std::min(TySizeInBytes-SourceOffset, 8U)*8); } /// GetX86_64ByValArgumentPair - Given a high and low type that can ideally /// be used as elements of a two register pair to pass or return, return a /// first class aggregate to represent them. For example, if the low part of /// a by-value argument should be passed as i32* and the high part as float, /// return {i32*, float}. static llvm::Type * GetX86_64ByValArgumentPair(llvm::Type *Lo, llvm::Type *Hi, const llvm::DataLayout &TD) { // In order to correctly satisfy the ABI, we need to the high part to start // at offset 8. If the high and low parts we inferred are both 4-byte types // (e.g. i32 and i32) then the resultant struct type ({i32,i32}) won't have // the second element at offset 8. Check for this: unsigned LoSize = (unsigned)TD.getTypeAllocSize(Lo); unsigned HiAlign = TD.getABITypeAlignment(Hi); unsigned HiStart = llvm::RoundUpToAlignment(LoSize, HiAlign); assert(HiStart != 0 && HiStart <= 8 && "Invalid x86-64 argument pair!"); // To handle this, we have to increase the size of the low part so that the // second element will start at an 8 byte offset. We can't increase the size // of the second element because it might make us access off the end of the // struct. if (HiStart != 8) { // There are only two sorts of types the ABI generation code can produce for // the low part of a pair that aren't 8 bytes in size: float or i8/i16/i32. // Promote these to a larger type. if (Lo->isFloatTy()) Lo = llvm::Type::getDoubleTy(Lo->getContext()); else { assert(Lo->isIntegerTy() && "Invalid/unknown lo type"); Lo = llvm::Type::getInt64Ty(Lo->getContext()); } } llvm::StructType *Result = llvm::StructType::get(Lo, Hi, nullptr); // Verify that the second element is at an 8-byte offset. assert(TD.getStructLayout(Result)->getElementOffset(1) == 8 && "Invalid x86-64 argument pair!"); return Result; } ABIArgInfo X86_64ABIInfo:: classifyReturnType(QualType RetTy) const { // AMD64-ABI 3.2.3p4: Rule 1. Classify the return type with the // classification algorithm. X86_64ABIInfo::Class Lo, Hi; classify(RetTy, 0, Lo, Hi, /*isNamedArg*/ true); // Check some invariants. assert((Hi != Memory || Lo == Memory) && "Invalid memory classification."); assert((Hi != SSEUp || Lo == SSE) && "Invalid SSEUp classification."); llvm::Type *ResType = nullptr; switch (Lo) { case NoClass: if (Hi == NoClass) return ABIArgInfo::getIgnore(); // If the low part is just padding, it takes no register, leave ResType // null. assert((Hi == SSE || Hi == Integer || Hi == X87Up) && "Unknown missing lo part"); break; case SSEUp: case X87Up: llvm_unreachable("Invalid classification for lo word."); // AMD64-ABI 3.2.3p4: Rule 2. Types of class memory are returned via // hidden argument. case Memory: return getIndirectReturnResult(RetTy); // AMD64-ABI 3.2.3p4: Rule 3. If the class is INTEGER, the next // available register of the sequence %rax, %rdx is used. case Integer: ResType = GetINTEGERTypeAtOffset(CGT.ConvertType(RetTy), 0, RetTy, 0); // If we have a sign or zero extended integer, make sure to return Extend // so that the parameter gets the right LLVM IR attributes. if (Hi == NoClass && isa(ResType)) { // Treat an enum type as its underlying type. if (const EnumType *EnumTy = RetTy->getAs()) RetTy = EnumTy->getDecl()->getIntegerType(); if (RetTy->isIntegralOrEnumerationType() && RetTy->isPromotableIntegerType()) return ABIArgInfo::getExtend(); } break; // AMD64-ABI 3.2.3p4: Rule 4. If the class is SSE, the next // available SSE register of the sequence %xmm0, %xmm1 is used. case SSE: ResType = GetSSETypeAtOffset(CGT.ConvertType(RetTy), 0, RetTy, 0); break; // AMD64-ABI 3.2.3p4: Rule 6. If the class is X87, the value is // returned on the X87 stack in %st0 as 80-bit x87 number. case X87: ResType = llvm::Type::getX86_FP80Ty(getVMContext()); break; // AMD64-ABI 3.2.3p4: Rule 8. If the class is COMPLEX_X87, the real // part of the value is returned in %st0 and the imaginary part in // %st1. case ComplexX87: assert(Hi == ComplexX87 && "Unexpected ComplexX87 classification."); ResType = llvm::StructType::get(llvm::Type::getX86_FP80Ty(getVMContext()), llvm::Type::getX86_FP80Ty(getVMContext()), nullptr); break; } llvm::Type *HighPart = nullptr; switch (Hi) { // Memory was handled previously and X87 should // never occur as a hi class. case Memory: case X87: llvm_unreachable("Invalid classification for hi word."); case ComplexX87: // Previously handled. case NoClass: break; case Integer: HighPart = GetINTEGERTypeAtOffset(CGT.ConvertType(RetTy), 8, RetTy, 8); if (Lo == NoClass) // Return HighPart at offset 8 in memory. return ABIArgInfo::getDirect(HighPart, 8); break; case SSE: HighPart = GetSSETypeAtOffset(CGT.ConvertType(RetTy), 8, RetTy, 8); if (Lo == NoClass) // Return HighPart at offset 8 in memory. return ABIArgInfo::getDirect(HighPart, 8); break; // AMD64-ABI 3.2.3p4: Rule 5. If the class is SSEUP, the eightbyte // is passed in the next available eightbyte chunk if the last used // vector register. // // SSEUP should always be preceded by SSE, just widen. case SSEUp: assert(Lo == SSE && "Unexpected SSEUp classification."); ResType = GetByteVectorType(RetTy); break; // AMD64-ABI 3.2.3p4: Rule 7. If the class is X87UP, the value is // returned together with the previous X87 value in %st0. case X87Up: // If X87Up is preceded by X87, we don't need to do // anything. However, in some cases with unions it may not be // preceded by X87. In such situations we follow gcc and pass the // extra bits in an SSE reg. if (Lo != X87) { HighPart = GetSSETypeAtOffset(CGT.ConvertType(RetTy), 8, RetTy, 8); if (Lo == NoClass) // Return HighPart at offset 8 in memory. return ABIArgInfo::getDirect(HighPart, 8); } break; } // If a high part was specified, merge it together with the low part. It is // known to pass in the high eightbyte of the result. We do this by forming a // first class struct aggregate with the high and low part: {low, high} if (HighPart) ResType = GetX86_64ByValArgumentPair(ResType, HighPart, getDataLayout()); return ABIArgInfo::getDirect(ResType); } ABIArgInfo X86_64ABIInfo::classifyArgumentType( QualType Ty, unsigned freeIntRegs, unsigned &neededInt, unsigned &neededSSE, bool isNamedArg) const { Ty = useFirstFieldIfTransparentUnion(Ty); X86_64ABIInfo::Class Lo, Hi; classify(Ty, 0, Lo, Hi, isNamedArg); // Check some invariants. // FIXME: Enforce these by construction. assert((Hi != Memory || Lo == Memory) && "Invalid memory classification."); assert((Hi != SSEUp || Lo == SSE) && "Invalid SSEUp classification."); neededInt = 0; neededSSE = 0; llvm::Type *ResType = nullptr; switch (Lo) { case NoClass: if (Hi == NoClass) return ABIArgInfo::getIgnore(); // If the low part is just padding, it takes no register, leave ResType // null. assert((Hi == SSE || Hi == Integer || Hi == X87Up) && "Unknown missing lo part"); break; // AMD64-ABI 3.2.3p3: Rule 1. If the class is MEMORY, pass the argument // on the stack. case Memory: // AMD64-ABI 3.2.3p3: Rule 5. If the class is X87, X87UP or // COMPLEX_X87, it is passed in memory. case X87: case ComplexX87: if (getRecordArgABI(Ty, getCXXABI()) == CGCXXABI::RAA_Indirect) ++neededInt; return getIndirectResult(Ty, freeIntRegs); case SSEUp: case X87Up: llvm_unreachable("Invalid classification for lo word."); // AMD64-ABI 3.2.3p3: Rule 2. If the class is INTEGER, the next // available register of the sequence %rdi, %rsi, %rdx, %rcx, %r8 // and %r9 is used. case Integer: ++neededInt; // Pick an 8-byte type based on the preferred type. ResType = GetINTEGERTypeAtOffset(CGT.ConvertType(Ty), 0, Ty, 0); // If we have a sign or zero extended integer, make sure to return Extend // so that the parameter gets the right LLVM IR attributes. if (Hi == NoClass && isa(ResType)) { // Treat an enum type as its underlying type. if (const EnumType *EnumTy = Ty->getAs()) Ty = EnumTy->getDecl()->getIntegerType(); if (Ty->isIntegralOrEnumerationType() && Ty->isPromotableIntegerType()) return ABIArgInfo::getExtend(); } break; // AMD64-ABI 3.2.3p3: Rule 3. If the class is SSE, the next // available SSE register is used, the registers are taken in the // order from %xmm0 to %xmm7. case SSE: { llvm::Type *IRType = CGT.ConvertType(Ty); ResType = GetSSETypeAtOffset(IRType, 0, Ty, 0); ++neededSSE; break; } } llvm::Type *HighPart = nullptr; switch (Hi) { // Memory was handled previously, ComplexX87 and X87 should // never occur as hi classes, and X87Up must be preceded by X87, // which is passed in memory. case Memory: case X87: case ComplexX87: llvm_unreachable("Invalid classification for hi word."); case NoClass: break; case Integer: ++neededInt; // Pick an 8-byte type based on the preferred type. HighPart = GetINTEGERTypeAtOffset(CGT.ConvertType(Ty), 8, Ty, 8); if (Lo == NoClass) // Pass HighPart at offset 8 in memory. return ABIArgInfo::getDirect(HighPart, 8); break; // X87Up generally doesn't occur here (long double is passed in // memory), except in situations involving unions. case X87Up: case SSE: HighPart = GetSSETypeAtOffset(CGT.ConvertType(Ty), 8, Ty, 8); if (Lo == NoClass) // Pass HighPart at offset 8 in memory. return ABIArgInfo::getDirect(HighPart, 8); ++neededSSE; break; // AMD64-ABI 3.2.3p3: Rule 4. If the class is SSEUP, the // eightbyte is passed in the upper half of the last used SSE // register. This only happens when 128-bit vectors are passed. case SSEUp: assert(Lo == SSE && "Unexpected SSEUp classification"); ResType = GetByteVectorType(Ty); break; } // If a high part was specified, merge it together with the low part. It is // known to pass in the high eightbyte of the result. We do this by forming a // first class struct aggregate with the high and low part: {low, high} if (HighPart) ResType = GetX86_64ByValArgumentPair(ResType, HighPart, getDataLayout()); return ABIArgInfo::getDirect(ResType); } void X86_64ABIInfo::computeInfo(CGFunctionInfo &FI) const { if (!getCXXABI().classifyReturnType(FI)) FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); // Keep track of the number of assigned registers. unsigned freeIntRegs = 6, freeSSERegs = 8; // If the return value is indirect, then the hidden argument is consuming one // integer register. if (FI.getReturnInfo().isIndirect()) --freeIntRegs; // The chain argument effectively gives us another free register. if (FI.isChainCall()) ++freeIntRegs; unsigned NumRequiredArgs = FI.getNumRequiredArgs(); // AMD64-ABI 3.2.3p3: Once arguments are classified, the registers // get assigned (in left-to-right order) for passing as follows... unsigned ArgNo = 0; for (CGFunctionInfo::arg_iterator it = FI.arg_begin(), ie = FI.arg_end(); it != ie; ++it, ++ArgNo) { bool IsNamedArg = ArgNo < NumRequiredArgs; unsigned neededInt, neededSSE; it->info = classifyArgumentType(it->type, freeIntRegs, neededInt, neededSSE, IsNamedArg); // AMD64-ABI 3.2.3p3: If there are no registers available for any // eightbyte of an argument, the whole argument is passed on the // stack. If registers have already been assigned for some // eightbytes of such an argument, the assignments get reverted. if (freeIntRegs >= neededInt && freeSSERegs >= neededSSE) { freeIntRegs -= neededInt; freeSSERegs -= neededSSE; } else { it->info = getIndirectResult(it->type, freeIntRegs); } } } static llvm::Value *EmitVAArgFromMemory(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) { llvm::Value *overflow_arg_area_p = CGF.Builder.CreateStructGEP(VAListAddr, 2, "overflow_arg_area_p"); llvm::Value *overflow_arg_area = CGF.Builder.CreateLoad(overflow_arg_area_p, "overflow_arg_area"); // AMD64-ABI 3.5.7p5: Step 7. Align l->overflow_arg_area upwards to a 16 // byte boundary if alignment needed by type exceeds 8 byte boundary. // It isn't stated explicitly in the standard, but in practice we use // alignment greater than 16 where necessary. uint64_t Align = CGF.getContext().getTypeAlign(Ty) / 8; if (Align > 8) { // overflow_arg_area = (overflow_arg_area + align - 1) & -align; llvm::Value *Offset = llvm::ConstantInt::get(CGF.Int64Ty, Align - 1); overflow_arg_area = CGF.Builder.CreateGEP(overflow_arg_area, Offset); llvm::Value *AsInt = CGF.Builder.CreatePtrToInt(overflow_arg_area, CGF.Int64Ty); llvm::Value *Mask = llvm::ConstantInt::get(CGF.Int64Ty, -(uint64_t)Align); overflow_arg_area = CGF.Builder.CreateIntToPtr(CGF.Builder.CreateAnd(AsInt, Mask), overflow_arg_area->getType(), "overflow_arg_area.align"); } // AMD64-ABI 3.5.7p5: Step 8. Fetch type from l->overflow_arg_area. llvm::Type *LTy = CGF.ConvertTypeForMem(Ty); llvm::Value *Res = CGF.Builder.CreateBitCast(overflow_arg_area, llvm::PointerType::getUnqual(LTy)); // AMD64-ABI 3.5.7p5: Step 9. Set l->overflow_arg_area to: // l->overflow_arg_area + sizeof(type). // AMD64-ABI 3.5.7p5: Step 10. Align l->overflow_arg_area upwards to // an 8 byte boundary. uint64_t SizeInBytes = (CGF.getContext().getTypeSize(Ty) + 7) / 8; llvm::Value *Offset = llvm::ConstantInt::get(CGF.Int32Ty, (SizeInBytes + 7) & ~7); overflow_arg_area = CGF.Builder.CreateGEP(overflow_arg_area, Offset, "overflow_arg_area.next"); CGF.Builder.CreateStore(overflow_arg_area, overflow_arg_area_p); // AMD64-ABI 3.5.7p5: Step 11. Return the fetched type. return Res; } llvm::Value *X86_64ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { // Assume that va_list type is correct; should be pointer to LLVM type: // struct { // i32 gp_offset; // i32 fp_offset; // i8* overflow_arg_area; // i8* reg_save_area; // }; unsigned neededInt, neededSSE; Ty = CGF.getContext().getCanonicalType(Ty); ABIArgInfo AI = classifyArgumentType(Ty, 0, neededInt, neededSSE, /*isNamedArg*/false); // AMD64-ABI 3.5.7p5: Step 1. Determine whether type may be passed // in the registers. If not go to step 7. if (!neededInt && !neededSSE) return EmitVAArgFromMemory(VAListAddr, Ty, CGF); // AMD64-ABI 3.5.7p5: Step 2. Compute num_gp to hold the number of // general purpose registers needed to pass type and num_fp to hold // the number of floating point registers needed. // AMD64-ABI 3.5.7p5: Step 3. Verify whether arguments fit into // registers. In the case: l->gp_offset > 48 - num_gp * 8 or // l->fp_offset > 304 - num_fp * 16 go to step 7. // // NOTE: 304 is a typo, there are (6 * 8 + 8 * 16) = 176 bytes of // register save space). llvm::Value *InRegs = nullptr; llvm::Value *gp_offset_p = nullptr, *gp_offset = nullptr; llvm::Value *fp_offset_p = nullptr, *fp_offset = nullptr; if (neededInt) { gp_offset_p = CGF.Builder.CreateStructGEP(VAListAddr, 0, "gp_offset_p"); gp_offset = CGF.Builder.CreateLoad(gp_offset_p, "gp_offset"); InRegs = llvm::ConstantInt::get(CGF.Int32Ty, 48 - neededInt * 8); InRegs = CGF.Builder.CreateICmpULE(gp_offset, InRegs, "fits_in_gp"); } if (neededSSE) { fp_offset_p = CGF.Builder.CreateStructGEP(VAListAddr, 1, "fp_offset_p"); fp_offset = CGF.Builder.CreateLoad(fp_offset_p, "fp_offset"); llvm::Value *FitsInFP = llvm::ConstantInt::get(CGF.Int32Ty, 176 - neededSSE * 16); FitsInFP = CGF.Builder.CreateICmpULE(fp_offset, FitsInFP, "fits_in_fp"); InRegs = InRegs ? CGF.Builder.CreateAnd(InRegs, FitsInFP) : FitsInFP; } llvm::BasicBlock *InRegBlock = CGF.createBasicBlock("vaarg.in_reg"); llvm::BasicBlock *InMemBlock = CGF.createBasicBlock("vaarg.in_mem"); llvm::BasicBlock *ContBlock = CGF.createBasicBlock("vaarg.end"); CGF.Builder.CreateCondBr(InRegs, InRegBlock, InMemBlock); // Emit code to load the value if it was passed in registers. CGF.EmitBlock(InRegBlock); // AMD64-ABI 3.5.7p5: Step 4. Fetch type from l->reg_save_area with // an offset of l->gp_offset and/or l->fp_offset. This may require // copying to a temporary location in case the parameter is passed // in different register classes or requires an alignment greater // than 8 for general purpose registers and 16 for XMM registers. // // FIXME: This really results in shameful code when we end up needing to // collect arguments from different places; often what should result in a // simple assembling of a structure from scattered addresses has many more // loads than necessary. Can we clean this up? llvm::Type *LTy = CGF.ConvertTypeForMem(Ty); llvm::Value *RegAddr = CGF.Builder.CreateLoad(CGF.Builder.CreateStructGEP(VAListAddr, 3), "reg_save_area"); if (neededInt && neededSSE) { // FIXME: Cleanup. assert(AI.isDirect() && "Unexpected ABI info for mixed regs"); llvm::StructType *ST = cast(AI.getCoerceToType()); llvm::Value *Tmp = CGF.CreateMemTemp(Ty); Tmp = CGF.Builder.CreateBitCast(Tmp, ST->getPointerTo()); assert(ST->getNumElements() == 2 && "Unexpected ABI info for mixed regs"); llvm::Type *TyLo = ST->getElementType(0); llvm::Type *TyHi = ST->getElementType(1); assert((TyLo->isFPOrFPVectorTy() ^ TyHi->isFPOrFPVectorTy()) && "Unexpected ABI info for mixed regs"); llvm::Type *PTyLo = llvm::PointerType::getUnqual(TyLo); llvm::Type *PTyHi = llvm::PointerType::getUnqual(TyHi); llvm::Value *GPAddr = CGF.Builder.CreateGEP(RegAddr, gp_offset); llvm::Value *FPAddr = CGF.Builder.CreateGEP(RegAddr, fp_offset); llvm::Value *RegLoAddr = TyLo->isFPOrFPVectorTy() ? FPAddr : GPAddr; llvm::Value *RegHiAddr = TyLo->isFPOrFPVectorTy() ? GPAddr : FPAddr; llvm::Value *V = CGF.Builder.CreateLoad(CGF.Builder.CreateBitCast(RegLoAddr, PTyLo)); CGF.Builder.CreateStore(V, CGF.Builder.CreateStructGEP(Tmp, 0)); V = CGF.Builder.CreateLoad(CGF.Builder.CreateBitCast(RegHiAddr, PTyHi)); CGF.Builder.CreateStore(V, CGF.Builder.CreateStructGEP(Tmp, 1)); RegAddr = CGF.Builder.CreateBitCast(Tmp, llvm::PointerType::getUnqual(LTy)); } else if (neededInt) { RegAddr = CGF.Builder.CreateGEP(RegAddr, gp_offset); RegAddr = CGF.Builder.CreateBitCast(RegAddr, llvm::PointerType::getUnqual(LTy)); // Copy to a temporary if necessary to ensure the appropriate alignment. std::pair SizeAlign = CGF.getContext().getTypeInfoInChars(Ty); uint64_t TySize = SizeAlign.first.getQuantity(); unsigned TyAlign = SizeAlign.second.getQuantity(); if (TyAlign > 8) { llvm::Value *Tmp = CGF.CreateMemTemp(Ty); CGF.Builder.CreateMemCpy(Tmp, RegAddr, TySize, 8, false); RegAddr = Tmp; } } else if (neededSSE == 1) { RegAddr = CGF.Builder.CreateGEP(RegAddr, fp_offset); RegAddr = CGF.Builder.CreateBitCast(RegAddr, llvm::PointerType::getUnqual(LTy)); } else { assert(neededSSE == 2 && "Invalid number of needed registers!"); // SSE registers are spaced 16 bytes apart in the register save // area, we need to collect the two eightbytes together. llvm::Value *RegAddrLo = CGF.Builder.CreateGEP(RegAddr, fp_offset); llvm::Value *RegAddrHi = CGF.Builder.CreateConstGEP1_32(RegAddrLo, 16); llvm::Type *DoubleTy = CGF.DoubleTy; llvm::Type *DblPtrTy = llvm::PointerType::getUnqual(DoubleTy); llvm::StructType *ST = llvm::StructType::get(DoubleTy, DoubleTy, nullptr); llvm::Value *V, *Tmp = CGF.CreateMemTemp(Ty); Tmp = CGF.Builder.CreateBitCast(Tmp, ST->getPointerTo()); V = CGF.Builder.CreateLoad(CGF.Builder.CreateBitCast(RegAddrLo, DblPtrTy)); CGF.Builder.CreateStore(V, CGF.Builder.CreateStructGEP(Tmp, 0)); V = CGF.Builder.CreateLoad(CGF.Builder.CreateBitCast(RegAddrHi, DblPtrTy)); CGF.Builder.CreateStore(V, CGF.Builder.CreateStructGEP(Tmp, 1)); RegAddr = CGF.Builder.CreateBitCast(Tmp, llvm::PointerType::getUnqual(LTy)); } // AMD64-ABI 3.5.7p5: Step 5. Set: // l->gp_offset = l->gp_offset + num_gp * 8 // l->fp_offset = l->fp_offset + num_fp * 16. if (neededInt) { llvm::Value *Offset = llvm::ConstantInt::get(CGF.Int32Ty, neededInt * 8); CGF.Builder.CreateStore(CGF.Builder.CreateAdd(gp_offset, Offset), gp_offset_p); } if (neededSSE) { llvm::Value *Offset = llvm::ConstantInt::get(CGF.Int32Ty, neededSSE * 16); CGF.Builder.CreateStore(CGF.Builder.CreateAdd(fp_offset, Offset), fp_offset_p); } CGF.EmitBranch(ContBlock); // Emit code to load the value if it was passed in memory. CGF.EmitBlock(InMemBlock); llvm::Value *MemAddr = EmitVAArgFromMemory(VAListAddr, Ty, CGF); // Return the appropriate result. CGF.EmitBlock(ContBlock); llvm::PHINode *ResAddr = CGF.Builder.CreatePHI(RegAddr->getType(), 2, "vaarg.addr"); ResAddr->addIncoming(RegAddr, InRegBlock); ResAddr->addIncoming(MemAddr, InMemBlock); return ResAddr; } ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, bool IsReturnType) const { if (Ty->isVoidType()) return ABIArgInfo::getIgnore(); if (const EnumType *EnumTy = Ty->getAs()) Ty = EnumTy->getDecl()->getIntegerType(); TypeInfo Info = getContext().getTypeInfo(Ty); uint64_t Width = Info.Width; unsigned Align = getContext().toCharUnitsFromBits(Info.Align).getQuantity(); const RecordType *RT = Ty->getAs(); if (RT) { if (!IsReturnType) { if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI())) return ABIArgInfo::getIndirect(0, RAA == CGCXXABI::RAA_DirectInMemory); } if (RT->getDecl()->hasFlexibleArrayMember()) return ABIArgInfo::getIndirect(0, /*ByVal=*/false); // FIXME: mingw-w64-gcc emits 128-bit struct as i128 if (Width == 128 && getTarget().getTriple().isWindowsGNUEnvironment()) return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Width)); } // vectorcall adds the concept of a homogenous vector aggregate, similar to // other targets. const Type *Base = nullptr; uint64_t NumElts = 0; if (FreeSSERegs && isHomogeneousAggregate(Ty, Base, NumElts)) { if (FreeSSERegs >= NumElts) { FreeSSERegs -= NumElts; if (IsReturnType || Ty->isBuiltinType() || Ty->isVectorType()) return ABIArgInfo::getDirect(); return ABIArgInfo::getExpand(); } return ABIArgInfo::getIndirect(Align, /*ByVal=*/false); } if (Ty->isMemberPointerType()) { // If the member pointer is represented by an LLVM int or ptr, pass it // directly. llvm::Type *LLTy = CGT.ConvertType(Ty); if (LLTy->isPointerTy() || LLTy->isIntegerTy()) return ABIArgInfo::getDirect(); } if (RT || Ty->isMemberPointerType()) { // MS x64 ABI requirement: "Any argument that doesn't fit in 8 bytes, or is // not 1, 2, 4, or 8 bytes, must be passed by reference." if (Width > 64 || !llvm::isPowerOf2_64(Width)) return ABIArgInfo::getIndirect(0, /*ByVal=*/false); // Otherwise, coerce it to a small integer. return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Width)); } // Bool type is always extended to the ABI, other builtin types are not // extended. const BuiltinType *BT = Ty->getAs(); if (BT && BT->getKind() == BuiltinType::Bool) return ABIArgInfo::getExtend(); return ABIArgInfo::getDirect(); } void WinX86_64ABIInfo::computeInfo(CGFunctionInfo &FI) const { bool IsVectorCall = FI.getCallingConvention() == llvm::CallingConv::X86_VectorCall; // We can use up to 4 SSE return registers with vectorcall. unsigned FreeSSERegs = IsVectorCall ? 4 : 0; if (!getCXXABI().classifyReturnType(FI)) FI.getReturnInfo() = classify(FI.getReturnType(), FreeSSERegs, true); // We can use up to 6 SSE register parameters with vectorcall. FreeSSERegs = IsVectorCall ? 6 : 0; for (auto &I : FI.arguments()) I.info = classify(I.type, FreeSSERegs, false); } llvm::Value *WinX86_64ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { llvm::Type *BPP = CGF.Int8PtrPtrTy; CGBuilderTy &Builder = CGF.Builder; llvm::Value *VAListAddrAsBPP = Builder.CreateBitCast(VAListAddr, BPP, "ap"); llvm::Value *Addr = Builder.CreateLoad(VAListAddrAsBPP, "ap.cur"); llvm::Type *PTy = llvm::PointerType::getUnqual(CGF.ConvertType(Ty)); llvm::Value *AddrTyped = Builder.CreateBitCast(Addr, PTy); uint64_t Offset = llvm::RoundUpToAlignment(CGF.getContext().getTypeSize(Ty) / 8, 8); llvm::Value *NextAddr = Builder.CreateGEP(Addr, llvm::ConstantInt::get(CGF.Int32Ty, Offset), "ap.next"); Builder.CreateStore(NextAddr, VAListAddrAsBPP); return AddrTyped; } namespace { class NaClX86_64ABIInfo : public ABIInfo { public: NaClX86_64ABIInfo(CodeGen::CodeGenTypes &CGT, bool HasAVX) : ABIInfo(CGT), PInfo(CGT), NInfo(CGT, HasAVX) {} void computeInfo(CGFunctionInfo &FI) const override; llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; private: PNaClABIInfo PInfo; // Used for generating calls with pnaclcall callingconv. X86_64ABIInfo NInfo; // Used for everything else. }; class NaClX86_64TargetCodeGenInfo : public TargetCodeGenInfo { bool HasAVX; public: NaClX86_64TargetCodeGenInfo(CodeGen::CodeGenTypes &CGT, bool HasAVX) : TargetCodeGenInfo(new NaClX86_64ABIInfo(CGT, HasAVX)), HasAVX(HasAVX) { } unsigned getOpenMPSimdDefaultAlignment(QualType) const override { return HasAVX ? 32 : 16; } }; } void NaClX86_64ABIInfo::computeInfo(CGFunctionInfo &FI) const { if (FI.getASTCallingConvention() == CC_PnaclCall) PInfo.computeInfo(FI); else NInfo.computeInfo(FI); } llvm::Value *NaClX86_64ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { // Always use the native convention; calling pnacl-style varargs functions // is unuspported. return NInfo.EmitVAArg(VAListAddr, Ty, CGF); } // PowerPC-32 namespace { /// PPC32_SVR4_ABIInfo - The 32-bit PowerPC ELF (SVR4) ABI information. class PPC32_SVR4_ABIInfo : public DefaultABIInfo { public: PPC32_SVR4_ABIInfo(CodeGen::CodeGenTypes &CGT) : DefaultABIInfo(CGT) {} llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; }; class PPC32TargetCodeGenInfo : public TargetCodeGenInfo { public: PPC32TargetCodeGenInfo(CodeGenTypes &CGT) : TargetCodeGenInfo(new PPC32_SVR4_ABIInfo(CGT)) {} int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const override { // This is recovered from gcc output. return 1; // r1 is the dedicated stack pointer } bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const override; unsigned getOpenMPSimdDefaultAlignment(QualType) const override { return 16; // Natural alignment for Altivec vectors. } }; } llvm::Value *PPC32_SVR4_ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { if (const ComplexType *CTy = Ty->getAs()) { // TODO: Implement this. For now ignore. (void)CTy; return nullptr; } bool isI64 = Ty->isIntegerType() && getContext().getTypeSize(Ty) == 64; bool isInt = Ty->isIntegerType() || Ty->isPointerType() || Ty->isAggregateType(); llvm::Type *CharPtr = CGF.Int8PtrTy; llvm::Type *CharPtrPtr = CGF.Int8PtrPtrTy; CGBuilderTy &Builder = CGF.Builder; llvm::Value *GPRPtr = Builder.CreateBitCast(VAListAddr, CharPtr, "gprptr"); llvm::Value *GPRPtrAsInt = Builder.CreatePtrToInt(GPRPtr, CGF.Int32Ty); llvm::Value *FPRPtrAsInt = Builder.CreateAdd(GPRPtrAsInt, Builder.getInt32(1)); llvm::Value *FPRPtr = Builder.CreateIntToPtr(FPRPtrAsInt, CharPtr); llvm::Value *OverflowAreaPtrAsInt = Builder.CreateAdd(FPRPtrAsInt, Builder.getInt32(3)); llvm::Value *OverflowAreaPtr = Builder.CreateIntToPtr(OverflowAreaPtrAsInt, CharPtrPtr); llvm::Value *RegsaveAreaPtrAsInt = Builder.CreateAdd(OverflowAreaPtrAsInt, Builder.getInt32(4)); llvm::Value *RegsaveAreaPtr = Builder.CreateIntToPtr(RegsaveAreaPtrAsInt, CharPtrPtr); llvm::Value *GPR = Builder.CreateLoad(GPRPtr, false, "gpr"); // Align GPR when TY is i64. if (isI64) { llvm::Value *GPRAnd = Builder.CreateAnd(GPR, Builder.getInt8(1)); llvm::Value *CC64 = Builder.CreateICmpEQ(GPRAnd, Builder.getInt8(1)); llvm::Value *GPRPlusOne = Builder.CreateAdd(GPR, Builder.getInt8(1)); GPR = Builder.CreateSelect(CC64, GPRPlusOne, GPR); } llvm::Value *FPR = Builder.CreateLoad(FPRPtr, false, "fpr"); llvm::Value *OverflowArea = Builder.CreateLoad(OverflowAreaPtr, false, "overflow_area"); llvm::Value *OverflowAreaAsInt = Builder.CreatePtrToInt(OverflowArea, CGF.Int32Ty); llvm::Value *RegsaveArea = Builder.CreateLoad(RegsaveAreaPtr, false, "regsave_area"); llvm::Value *RegsaveAreaAsInt = Builder.CreatePtrToInt(RegsaveArea, CGF.Int32Ty); llvm::Value *CC = Builder.CreateICmpULT(isInt ? GPR : FPR, Builder.getInt8(8), "cond"); llvm::Value *RegConstant = Builder.CreateMul(isInt ? GPR : FPR, Builder.getInt8(isInt ? 4 : 8)); llvm::Value *OurReg = Builder.CreateAdd(RegsaveAreaAsInt, Builder.CreateSExt(RegConstant, CGF.Int32Ty)); if (Ty->isFloatingType()) OurReg = Builder.CreateAdd(OurReg, Builder.getInt32(32)); llvm::BasicBlock *UsingRegs = CGF.createBasicBlock("using_regs"); llvm::BasicBlock *UsingOverflow = CGF.createBasicBlock("using_overflow"); llvm::BasicBlock *Cont = CGF.createBasicBlock("cont"); Builder.CreateCondBr(CC, UsingRegs, UsingOverflow); CGF.EmitBlock(UsingRegs); llvm::Type *PTy = llvm::PointerType::getUnqual(CGF.ConvertType(Ty)); llvm::Value *Result1 = Builder.CreateIntToPtr(OurReg, PTy); // Increase the GPR/FPR indexes. if (isInt) { GPR = Builder.CreateAdd(GPR, Builder.getInt8(isI64 ? 2 : 1)); Builder.CreateStore(GPR, GPRPtr); } else { FPR = Builder.CreateAdd(FPR, Builder.getInt8(1)); Builder.CreateStore(FPR, FPRPtr); } CGF.EmitBranch(Cont); CGF.EmitBlock(UsingOverflow); // Increase the overflow area. llvm::Value *Result2 = Builder.CreateIntToPtr(OverflowAreaAsInt, PTy); OverflowAreaAsInt = Builder.CreateAdd(OverflowAreaAsInt, Builder.getInt32(isInt ? 4 : 8)); Builder.CreateStore(Builder.CreateIntToPtr(OverflowAreaAsInt, CharPtr), OverflowAreaPtr); CGF.EmitBranch(Cont); CGF.EmitBlock(Cont); llvm::PHINode *Result = CGF.Builder.CreatePHI(PTy, 2, "vaarg.addr"); Result->addIncoming(Result1, UsingRegs); Result->addIncoming(Result2, UsingOverflow); if (Ty->isAggregateType()) { llvm::Value *AGGPtr = Builder.CreateBitCast(Result, CharPtrPtr, "aggrptr") ; return Builder.CreateLoad(AGGPtr, false, "aggr"); } return Result; } bool PPC32TargetCodeGenInfo::initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const { // This is calculated from the LLVM and GCC tables and verified // against gcc output. AFAIK all ABIs use the same encoding. CodeGen::CGBuilderTy &Builder = CGF.Builder; llvm::IntegerType *i8 = CGF.Int8Ty; llvm::Value *Four8 = llvm::ConstantInt::get(i8, 4); llvm::Value *Eight8 = llvm::ConstantInt::get(i8, 8); llvm::Value *Sixteen8 = llvm::ConstantInt::get(i8, 16); // 0-31: r0-31, the 4-byte general-purpose registers AssignToArrayRange(Builder, Address, Four8, 0, 31); // 32-63: fp0-31, the 8-byte floating-point registers AssignToArrayRange(Builder, Address, Eight8, 32, 63); // 64-76 are various 4-byte special-purpose registers: // 64: mq // 65: lr // 66: ctr // 67: ap // 68-75 cr0-7 // 76: xer AssignToArrayRange(Builder, Address, Four8, 64, 76); // 77-108: v0-31, the 16-byte vector registers AssignToArrayRange(Builder, Address, Sixteen8, 77, 108); // 109: vrsave // 110: vscr // 111: spe_acc // 112: spefscr // 113: sfp AssignToArrayRange(Builder, Address, Four8, 109, 113); return false; } // PowerPC-64 namespace { /// PPC64_SVR4_ABIInfo - The 64-bit PowerPC ELF (SVR4) ABI information. class PPC64_SVR4_ABIInfo : public DefaultABIInfo { public: enum ABIKind { ELFv1 = 0, ELFv2 }; private: static const unsigned GPRBits = 64; ABIKind Kind; public: PPC64_SVR4_ABIInfo(CodeGen::CodeGenTypes &CGT, ABIKind Kind) : DefaultABIInfo(CGT), Kind(Kind) {} bool isPromotableTypeForABI(QualType Ty) const; bool isAlignedParamType(QualType Ty) const; ABIArgInfo classifyReturnType(QualType RetTy) const; ABIArgInfo classifyArgumentType(QualType Ty) const; bool isHomogeneousAggregateBaseType(QualType Ty) const override; bool isHomogeneousAggregateSmallEnough(const Type *Ty, uint64_t Members) const override; // TODO: We can add more logic to computeInfo to improve performance. // Example: For aggregate arguments that fit in a register, we could // use getDirectInReg (as is done below for structs containing a single // floating-point value) to avoid pushing them to memory on function // entry. This would require changing the logic in PPCISelLowering // when lowering the parameters in the caller and args in the callee. void computeInfo(CGFunctionInfo &FI) const override { if (!getCXXABI().classifyReturnType(FI)) FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); for (auto &I : FI.arguments()) { // We rely on the default argument classification for the most part. // One exception: An aggregate containing a single floating-point // or vector item must be passed in a register if one is available. const Type *T = isSingleElementStruct(I.type, getContext()); if (T) { const BuiltinType *BT = T->getAs(); if ((T->isVectorType() && getContext().getTypeSize(T) == 128) || (BT && BT->isFloatingPoint())) { QualType QT(T, 0); I.info = ABIArgInfo::getDirectInReg(CGT.ConvertType(QT)); continue; } } I.info = classifyArgumentType(I.type); } } llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; }; class PPC64_SVR4_TargetCodeGenInfo : public TargetCodeGenInfo { public: PPC64_SVR4_TargetCodeGenInfo(CodeGenTypes &CGT, PPC64_SVR4_ABIInfo::ABIKind Kind) : TargetCodeGenInfo(new PPC64_SVR4_ABIInfo(CGT, Kind)) {} int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const override { // This is recovered from gcc output. return 1; // r1 is the dedicated stack pointer } bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const override; unsigned getOpenMPSimdDefaultAlignment(QualType) const override { return 16; // Natural alignment for Altivec and VSX vectors. } }; class PPC64TargetCodeGenInfo : public DefaultTargetCodeGenInfo { public: PPC64TargetCodeGenInfo(CodeGenTypes &CGT) : DefaultTargetCodeGenInfo(CGT) {} int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const override { // This is recovered from gcc output. return 1; // r1 is the dedicated stack pointer } bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const override; unsigned getOpenMPSimdDefaultAlignment(QualType) const override { return 16; // Natural alignment for Altivec vectors. } }; } // Return true if the ABI requires Ty to be passed sign- or zero- // extended to 64 bits. bool PPC64_SVR4_ABIInfo::isPromotableTypeForABI(QualType Ty) const { // Treat an enum type as its underlying type. if (const EnumType *EnumTy = Ty->getAs()) Ty = EnumTy->getDecl()->getIntegerType(); // Promotable integer types are required to be promoted by the ABI. if (Ty->isPromotableIntegerType()) return true; // In addition to the usual promotable integer types, we also need to // extend all 32-bit types, since the ABI requires promotion to 64 bits. if (const BuiltinType *BT = Ty->getAs()) switch (BT->getKind()) { case BuiltinType::Int: case BuiltinType::UInt: return true; default: break; } return false; } /// isAlignedParamType - Determine whether a type requires 16-byte /// alignment in the parameter area. bool PPC64_SVR4_ABIInfo::isAlignedParamType(QualType Ty) const { // Complex types are passed just like their elements. if (const ComplexType *CTy = Ty->getAs()) Ty = CTy->getElementType(); // Only vector types of size 16 bytes need alignment (larger types are // passed via reference, smaller types are not aligned). if (Ty->isVectorType()) return getContext().getTypeSize(Ty) == 128; // For single-element float/vector structs, we consider the whole type // to have the same alignment requirements as its single element. const Type *AlignAsType = nullptr; const Type *EltType = isSingleElementStruct(Ty, getContext()); if (EltType) { const BuiltinType *BT = EltType->getAs(); if ((EltType->isVectorType() && getContext().getTypeSize(EltType) == 128) || (BT && BT->isFloatingPoint())) AlignAsType = EltType; } // Likewise for ELFv2 homogeneous aggregates. const Type *Base = nullptr; uint64_t Members = 0; if (!AlignAsType && Kind == ELFv2 && isAggregateTypeForABI(Ty) && isHomogeneousAggregate(Ty, Base, Members)) AlignAsType = Base; // With special case aggregates, only vector base types need alignment. if (AlignAsType) return AlignAsType->isVectorType(); // Otherwise, we only need alignment for any aggregate type that // has an alignment requirement of >= 16 bytes. if (isAggregateTypeForABI(Ty) && getContext().getTypeAlign(Ty) >= 128) return true; return false; } /// isHomogeneousAggregate - Return true if a type is an ELFv2 homogeneous /// aggregate. Base is set to the base element type, and Members is set /// to the number of base elements. bool ABIInfo::isHomogeneousAggregate(QualType Ty, const Type *&Base, uint64_t &Members) const { if (const ConstantArrayType *AT = getContext().getAsConstantArrayType(Ty)) { uint64_t NElements = AT->getSize().getZExtValue(); if (NElements == 0) return false; if (!isHomogeneousAggregate(AT->getElementType(), Base, Members)) return false; Members *= NElements; } else if (const RecordType *RT = Ty->getAs()) { const RecordDecl *RD = RT->getDecl(); if (RD->hasFlexibleArrayMember()) return false; Members = 0; // If this is a C++ record, check the bases first. if (const CXXRecordDecl *CXXRD = dyn_cast(RD)) { for (const auto &I : CXXRD->bases()) { // Ignore empty records. if (isEmptyRecord(getContext(), I.getType(), true)) continue; uint64_t FldMembers; if (!isHomogeneousAggregate(I.getType(), Base, FldMembers)) return false; Members += FldMembers; } } for (const auto *FD : RD->fields()) { // Ignore (non-zero arrays of) empty records. QualType FT = FD->getType(); while (const ConstantArrayType *AT = getContext().getAsConstantArrayType(FT)) { if (AT->getSize().getZExtValue() == 0) return false; FT = AT->getElementType(); } if (isEmptyRecord(getContext(), FT, true)) continue; // For compatibility with GCC, ignore empty bitfields in C++ mode. if (getContext().getLangOpts().CPlusPlus && FD->isBitField() && FD->getBitWidthValue(getContext()) == 0) continue; uint64_t FldMembers; if (!isHomogeneousAggregate(FD->getType(), Base, FldMembers)) return false; Members = (RD->isUnion() ? std::max(Members, FldMembers) : Members + FldMembers); } if (!Base) return false; // Ensure there is no padding. if (getContext().getTypeSize(Base) * Members != getContext().getTypeSize(Ty)) return false; } else { Members = 1; if (const ComplexType *CT = Ty->getAs()) { Members = 2; Ty = CT->getElementType(); } // Most ABIs only support float, double, and some vector type widths. if (!isHomogeneousAggregateBaseType(Ty)) return false; // The base type must be the same for all members. Types that // agree in both total size and mode (float vs. vector) are // treated as being equivalent here. const Type *TyPtr = Ty.getTypePtr(); if (!Base) Base = TyPtr; if (Base->isVectorType() != TyPtr->isVectorType() || getContext().getTypeSize(Base) != getContext().getTypeSize(TyPtr)) return false; } return Members > 0 && isHomogeneousAggregateSmallEnough(Base, Members); } bool PPC64_SVR4_ABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const { // Homogeneous aggregates for ELFv2 must have base types of float, // double, long double, or 128-bit vectors. if (const BuiltinType *BT = Ty->getAs()) { if (BT->getKind() == BuiltinType::Float || BT->getKind() == BuiltinType::Double || BT->getKind() == BuiltinType::LongDouble) return true; } if (const VectorType *VT = Ty->getAs()) { if (getContext().getTypeSize(VT) == 128) return true; } return false; } bool PPC64_SVR4_ABIInfo::isHomogeneousAggregateSmallEnough( const Type *Base, uint64_t Members) const { // Vector types require one register, floating point types require one // or two registers depending on their size. uint32_t NumRegs = Base->isVectorType() ? 1 : (getContext().getTypeSize(Base) + 63) / 64; // Homogeneous Aggregates may occupy at most 8 registers. return Members * NumRegs <= 8; } ABIArgInfo PPC64_SVR4_ABIInfo::classifyArgumentType(QualType Ty) const { Ty = useFirstFieldIfTransparentUnion(Ty); if (Ty->isAnyComplexType()) return ABIArgInfo::getDirect(); // Non-Altivec vector types are passed in GPRs (smaller than 16 bytes) // or via reference (larger than 16 bytes). if (Ty->isVectorType()) { uint64_t Size = getContext().getTypeSize(Ty); if (Size > 128) return ABIArgInfo::getIndirect(0, /*ByVal=*/false); else if (Size < 128) { llvm::Type *CoerceTy = llvm::IntegerType::get(getVMContext(), Size); return ABIArgInfo::getDirect(CoerceTy); } } if (isAggregateTypeForABI(Ty)) { if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) return ABIArgInfo::getIndirect(0, RAA == CGCXXABI::RAA_DirectInMemory); uint64_t ABIAlign = isAlignedParamType(Ty)? 16 : 8; uint64_t TyAlign = getContext().getTypeAlign(Ty) / 8; // ELFv2 homogeneous aggregates are passed as array types. const Type *Base = nullptr; uint64_t Members = 0; if (Kind == ELFv2 && isHomogeneousAggregate(Ty, Base, Members)) { llvm::Type *BaseTy = CGT.ConvertType(QualType(Base, 0)); llvm::Type *CoerceTy = llvm::ArrayType::get(BaseTy, Members); return ABIArgInfo::getDirect(CoerceTy); } // If an aggregate may end up fully in registers, we do not // use the ByVal method, but pass the aggregate as array. // This is usually beneficial since we avoid forcing the // back-end to store the argument to memory. uint64_t Bits = getContext().getTypeSize(Ty); if (Bits > 0 && Bits <= 8 * GPRBits) { llvm::Type *CoerceTy; // Types up to 8 bytes are passed as integer type (which will be // properly aligned in the argument save area doubleword). if (Bits <= GPRBits) CoerceTy = llvm::IntegerType::get(getVMContext(), llvm::RoundUpToAlignment(Bits, 8)); // Larger types are passed as arrays, with the base type selected // according to the required alignment in the save area. else { uint64_t RegBits = ABIAlign * 8; uint64_t NumRegs = llvm::RoundUpToAlignment(Bits, RegBits) / RegBits; llvm::Type *RegTy = llvm::IntegerType::get(getVMContext(), RegBits); CoerceTy = llvm::ArrayType::get(RegTy, NumRegs); } return ABIArgInfo::getDirect(CoerceTy); } // All other aggregates are passed ByVal. return ABIArgInfo::getIndirect(ABIAlign, /*ByVal=*/true, /*Realign=*/TyAlign > ABIAlign); } return (isPromotableTypeForABI(Ty) ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } ABIArgInfo PPC64_SVR4_ABIInfo::classifyReturnType(QualType RetTy) const { if (RetTy->isVoidType()) return ABIArgInfo::getIgnore(); if (RetTy->isAnyComplexType()) return ABIArgInfo::getDirect(); // Non-Altivec vector types are returned in GPRs (smaller than 16 bytes) // or via reference (larger than 16 bytes). if (RetTy->isVectorType()) { uint64_t Size = getContext().getTypeSize(RetTy); if (Size > 128) return ABIArgInfo::getIndirect(0); else if (Size < 128) { llvm::Type *CoerceTy = llvm::IntegerType::get(getVMContext(), Size); return ABIArgInfo::getDirect(CoerceTy); } } if (isAggregateTypeForABI(RetTy)) { // ELFv2 homogeneous aggregates are returned as array types. const Type *Base = nullptr; uint64_t Members = 0; if (Kind == ELFv2 && isHomogeneousAggregate(RetTy, Base, Members)) { llvm::Type *BaseTy = CGT.ConvertType(QualType(Base, 0)); llvm::Type *CoerceTy = llvm::ArrayType::get(BaseTy, Members); return ABIArgInfo::getDirect(CoerceTy); } // ELFv2 small aggregates are returned in up to two registers. uint64_t Bits = getContext().getTypeSize(RetTy); if (Kind == ELFv2 && Bits <= 2 * GPRBits) { if (Bits == 0) return ABIArgInfo::getIgnore(); llvm::Type *CoerceTy; if (Bits > GPRBits) { CoerceTy = llvm::IntegerType::get(getVMContext(), GPRBits); CoerceTy = llvm::StructType::get(CoerceTy, CoerceTy, nullptr); } else CoerceTy = llvm::IntegerType::get(getVMContext(), llvm::RoundUpToAlignment(Bits, 8)); return ABIArgInfo::getDirect(CoerceTy); } // All other aggregates are returned indirectly. return ABIArgInfo::getIndirect(0); } return (isPromotableTypeForABI(RetTy) ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } // Based on ARMABIInfo::EmitVAArg, adjusted for 64-bit machine. llvm::Value *PPC64_SVR4_ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { llvm::Type *BP = CGF.Int8PtrTy; llvm::Type *BPP = CGF.Int8PtrPtrTy; CGBuilderTy &Builder = CGF.Builder; llvm::Value *VAListAddrAsBPP = Builder.CreateBitCast(VAListAddr, BPP, "ap"); llvm::Value *Addr = Builder.CreateLoad(VAListAddrAsBPP, "ap.cur"); // Handle types that require 16-byte alignment in the parameter save area. if (isAlignedParamType(Ty)) { llvm::Value *AddrAsInt = Builder.CreatePtrToInt(Addr, CGF.Int64Ty); AddrAsInt = Builder.CreateAdd(AddrAsInt, Builder.getInt64(15)); AddrAsInt = Builder.CreateAnd(AddrAsInt, Builder.getInt64(-16)); Addr = Builder.CreateIntToPtr(AddrAsInt, BP, "ap.align"); } // Update the va_list pointer. The pointer should be bumped by the // size of the object. We can trust getTypeSize() except for a complex // type whose base type is smaller than a doubleword. For these, the // size of the object is 16 bytes; see below for further explanation. unsigned SizeInBytes = CGF.getContext().getTypeSize(Ty) / 8; QualType BaseTy; unsigned CplxBaseSize = 0; if (const ComplexType *CTy = Ty->getAs()) { BaseTy = CTy->getElementType(); CplxBaseSize = CGF.getContext().getTypeSize(BaseTy) / 8; if (CplxBaseSize < 8) SizeInBytes = 16; } unsigned Offset = llvm::RoundUpToAlignment(SizeInBytes, 8); llvm::Value *NextAddr = Builder.CreateGEP(Addr, llvm::ConstantInt::get(CGF.Int64Ty, Offset), "ap.next"); Builder.CreateStore(NextAddr, VAListAddrAsBPP); // If we have a complex type and the base type is smaller than 8 bytes, // the ABI calls for the real and imaginary parts to be right-adjusted // in separate doublewords. However, Clang expects us to produce a // pointer to a structure with the two parts packed tightly. So generate // loads of the real and imaginary parts relative to the va_list pointer, // and store them to a temporary structure. if (CplxBaseSize && CplxBaseSize < 8) { llvm::Value *RealAddr = Builder.CreatePtrToInt(Addr, CGF.Int64Ty); llvm::Value *ImagAddr = RealAddr; if (CGF.CGM.getDataLayout().isBigEndian()) { RealAddr = Builder.CreateAdd(RealAddr, Builder.getInt64(8 - CplxBaseSize)); ImagAddr = Builder.CreateAdd(ImagAddr, Builder.getInt64(16 - CplxBaseSize)); } else { ImagAddr = Builder.CreateAdd(ImagAddr, Builder.getInt64(8)); } llvm::Type *PBaseTy = llvm::PointerType::getUnqual(CGF.ConvertType(BaseTy)); RealAddr = Builder.CreateIntToPtr(RealAddr, PBaseTy); ImagAddr = Builder.CreateIntToPtr(ImagAddr, PBaseTy); llvm::Value *Real = Builder.CreateLoad(RealAddr, false, ".vareal"); llvm::Value *Imag = Builder.CreateLoad(ImagAddr, false, ".vaimag"); llvm::Value *Ptr = CGF.CreateTempAlloca(CGT.ConvertTypeForMem(Ty), "vacplx"); llvm::Value *RealPtr = Builder.CreateStructGEP(Ptr, 0, ".real"); llvm::Value *ImagPtr = Builder.CreateStructGEP(Ptr, 1, ".imag"); Builder.CreateStore(Real, RealPtr, false); Builder.CreateStore(Imag, ImagPtr, false); return Ptr; } // If the argument is smaller than 8 bytes, it is right-adjusted in // its doubleword slot. Adjust the pointer to pick it up from the // correct offset. if (SizeInBytes < 8 && CGF.CGM.getDataLayout().isBigEndian()) { llvm::Value *AddrAsInt = Builder.CreatePtrToInt(Addr, CGF.Int64Ty); AddrAsInt = Builder.CreateAdd(AddrAsInt, Builder.getInt64(8 - SizeInBytes)); Addr = Builder.CreateIntToPtr(AddrAsInt, BP); } llvm::Type *PTy = llvm::PointerType::getUnqual(CGF.ConvertType(Ty)); return Builder.CreateBitCast(Addr, PTy); } static bool PPC64_initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF, llvm::Value *Address) { // This is calculated from the LLVM and GCC tables and verified // against gcc output. AFAIK all ABIs use the same encoding. CodeGen::CGBuilderTy &Builder = CGF.Builder; llvm::IntegerType *i8 = CGF.Int8Ty; llvm::Value *Four8 = llvm::ConstantInt::get(i8, 4); llvm::Value *Eight8 = llvm::ConstantInt::get(i8, 8); llvm::Value *Sixteen8 = llvm::ConstantInt::get(i8, 16); // 0-31: r0-31, the 8-byte general-purpose registers AssignToArrayRange(Builder, Address, Eight8, 0, 31); // 32-63: fp0-31, the 8-byte floating-point registers AssignToArrayRange(Builder, Address, Eight8, 32, 63); // 64-76 are various 4-byte special-purpose registers: // 64: mq // 65: lr // 66: ctr // 67: ap // 68-75 cr0-7 // 76: xer AssignToArrayRange(Builder, Address, Four8, 64, 76); // 77-108: v0-31, the 16-byte vector registers AssignToArrayRange(Builder, Address, Sixteen8, 77, 108); // 109: vrsave // 110: vscr // 111: spe_acc // 112: spefscr // 113: sfp AssignToArrayRange(Builder, Address, Four8, 109, 113); return false; } bool PPC64_SVR4_TargetCodeGenInfo::initDwarfEHRegSizeTable( CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const { return PPC64_initDwarfEHRegSizeTable(CGF, Address); } bool PPC64TargetCodeGenInfo::initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const { return PPC64_initDwarfEHRegSizeTable(CGF, Address); } //===----------------------------------------------------------------------===// // AArch64 ABI Implementation //===----------------------------------------------------------------------===// namespace { class AArch64ABIInfo : public ABIInfo { public: enum ABIKind { AAPCS = 0, DarwinPCS }; private: ABIKind Kind; public: AArch64ABIInfo(CodeGenTypes &CGT, ABIKind Kind) : ABIInfo(CGT), Kind(Kind) {} private: ABIKind getABIKind() const { return Kind; } bool isDarwinPCS() const { return Kind == DarwinPCS; } ABIArgInfo classifyReturnType(QualType RetTy) const; ABIArgInfo classifyArgumentType(QualType RetTy) const; bool isHomogeneousAggregateBaseType(QualType Ty) const override; bool isHomogeneousAggregateSmallEnough(const Type *Ty, uint64_t Members) const override; bool isIllegalVectorType(QualType Ty) const; void computeInfo(CGFunctionInfo &FI) const override { if (!getCXXABI().classifyReturnType(FI)) FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); for (auto &it : FI.arguments()) it.info = classifyArgumentType(it.type); } llvm::Value *EmitDarwinVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const; llvm::Value *EmitAAPCSVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const; virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override { return isDarwinPCS() ? EmitDarwinVAArg(VAListAddr, Ty, CGF) : EmitAAPCSVAArg(VAListAddr, Ty, CGF); } }; class AArch64TargetCodeGenInfo : public TargetCodeGenInfo { public: AArch64TargetCodeGenInfo(CodeGenTypes &CGT, AArch64ABIInfo::ABIKind Kind) : TargetCodeGenInfo(new AArch64ABIInfo(CGT, Kind)) {} StringRef getARCRetainAutoreleasedReturnValueMarker() const { return "mov\tfp, fp\t\t; marker for objc_retainAutoreleaseReturnValue"; } int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const { return 31; } virtual bool doesReturnSlotInterfereWithArgs() const { return false; } }; } ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty) const { Ty = useFirstFieldIfTransparentUnion(Ty); // Handle illegal vector types here. if (isIllegalVectorType(Ty)) { uint64_t Size = getContext().getTypeSize(Ty); if (Size <= 32) { llvm::Type *ResType = llvm::Type::getInt32Ty(getVMContext()); return ABIArgInfo::getDirect(ResType); } if (Size == 64) { llvm::Type *ResType = llvm::VectorType::get(llvm::Type::getInt32Ty(getVMContext()), 2); return ABIArgInfo::getDirect(ResType); } if (Size == 128) { llvm::Type *ResType = llvm::VectorType::get(llvm::Type::getInt32Ty(getVMContext()), 4); return ABIArgInfo::getDirect(ResType); } return ABIArgInfo::getIndirect(0, /*ByVal=*/false); } if (!isAggregateTypeForABI(Ty)) { // Treat an enum type as its underlying type. if (const EnumType *EnumTy = Ty->getAs()) Ty = EnumTy->getDecl()->getIntegerType(); return (Ty->isPromotableIntegerType() && isDarwinPCS() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } // Structures with either a non-trivial destructor or a non-trivial // copy constructor are always indirect. if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) { return ABIArgInfo::getIndirect(0, /*ByVal=*/RAA == CGCXXABI::RAA_DirectInMemory); } // Empty records are always ignored on Darwin, but actually passed in C++ mode // elsewhere for GNU compatibility. if (isEmptyRecord(getContext(), Ty, true)) { if (!getContext().getLangOpts().CPlusPlus || isDarwinPCS()) return ABIArgInfo::getIgnore(); return ABIArgInfo::getDirect(llvm::Type::getInt8Ty(getVMContext())); } // Homogeneous Floating-point Aggregates (HFAs) need to be expanded. const Type *Base = nullptr; uint64_t Members = 0; if (isHomogeneousAggregate(Ty, Base, Members)) { return ABIArgInfo::getDirect( llvm::ArrayType::get(CGT.ConvertType(QualType(Base, 0)), Members)); } // Aggregates <= 16 bytes are passed directly in registers or on the stack. uint64_t Size = getContext().getTypeSize(Ty); if (Size <= 128) { unsigned Alignment = getContext().getTypeAlign(Ty); Size = 64 * ((Size + 63) / 64); // round up to multiple of 8 bytes // We use a pair of i64 for 16-byte aggregate with 8-byte alignment. // For aggregates with 16-byte alignment, we use i128. if (Alignment < 128 && Size == 128) { llvm::Type *BaseTy = llvm::Type::getInt64Ty(getVMContext()); return ABIArgInfo::getDirect(llvm::ArrayType::get(BaseTy, Size / 64)); } return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Size)); } return ABIArgInfo::getIndirect(0, /*ByVal=*/false); } ABIArgInfo AArch64ABIInfo::classifyReturnType(QualType RetTy) const { if (RetTy->isVoidType()) return ABIArgInfo::getIgnore(); // Large vector types should be returned via memory. if (RetTy->isVectorType() && getContext().getTypeSize(RetTy) > 128) return ABIArgInfo::getIndirect(0); if (!isAggregateTypeForABI(RetTy)) { // Treat an enum type as its underlying type. if (const EnumType *EnumTy = RetTy->getAs()) RetTy = EnumTy->getDecl()->getIntegerType(); return (RetTy->isPromotableIntegerType() && isDarwinPCS() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } if (isEmptyRecord(getContext(), RetTy, true)) return ABIArgInfo::getIgnore(); const Type *Base = nullptr; uint64_t Members = 0; if (isHomogeneousAggregate(RetTy, Base, Members)) // Homogeneous Floating-point Aggregates (HFAs) are returned directly. return ABIArgInfo::getDirect(); // Aggregates <= 16 bytes are returned directly in registers or on the stack. uint64_t Size = getContext().getTypeSize(RetTy); if (Size <= 128) { Size = 64 * ((Size + 63) / 64); // round up to multiple of 8 bytes return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Size)); } return ABIArgInfo::getIndirect(0); } /// isIllegalVectorType - check whether the vector type is legal for AArch64. bool AArch64ABIInfo::isIllegalVectorType(QualType Ty) const { if (const VectorType *VT = Ty->getAs()) { // Check whether VT is legal. unsigned NumElements = VT->getNumElements(); uint64_t Size = getContext().getTypeSize(VT); // NumElements should be power of 2 between 1 and 16. if ((NumElements & (NumElements - 1)) != 0 || NumElements > 16) return true; return Size != 64 && (Size != 128 || NumElements == 1); } return false; } bool AArch64ABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const { // Homogeneous aggregates for AAPCS64 must have base types of a floating // point type or a short-vector type. This is the same as the 32-bit ABI, // but with the difference that any floating-point type is allowed, // including __fp16. if (const BuiltinType *BT = Ty->getAs()) { if (BT->isFloatingPoint()) return true; } else if (const VectorType *VT = Ty->getAs()) { unsigned VecSize = getContext().getTypeSize(VT); if (VecSize == 64 || VecSize == 128) return true; } return false; } bool AArch64ABIInfo::isHomogeneousAggregateSmallEnough(const Type *Base, uint64_t Members) const { return Members <= 4; } llvm::Value *AArch64ABIInfo::EmitAAPCSVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { ABIArgInfo AI = classifyArgumentType(Ty); bool IsIndirect = AI.isIndirect(); llvm::Type *BaseTy = CGF.ConvertType(Ty); if (IsIndirect) BaseTy = llvm::PointerType::getUnqual(BaseTy); else if (AI.getCoerceToType()) BaseTy = AI.getCoerceToType(); unsigned NumRegs = 1; if (llvm::ArrayType *ArrTy = dyn_cast(BaseTy)) { BaseTy = ArrTy->getElementType(); NumRegs = ArrTy->getNumElements(); } bool IsFPR = BaseTy->isFloatingPointTy() || BaseTy->isVectorTy(); // The AArch64 va_list type and handling is specified in the Procedure Call // Standard, section B.4: // // struct { // void *__stack; // void *__gr_top; // void *__vr_top; // int __gr_offs; // int __vr_offs; // }; llvm::BasicBlock *MaybeRegBlock = CGF.createBasicBlock("vaarg.maybe_reg"); llvm::BasicBlock *InRegBlock = CGF.createBasicBlock("vaarg.in_reg"); llvm::BasicBlock *OnStackBlock = CGF.createBasicBlock("vaarg.on_stack"); llvm::BasicBlock *ContBlock = CGF.createBasicBlock("vaarg.end"); auto &Ctx = CGF.getContext(); llvm::Value *reg_offs_p = nullptr, *reg_offs = nullptr; int reg_top_index; int RegSize = IsIndirect ? 8 : getContext().getTypeSize(Ty) / 8; if (!IsFPR) { // 3 is the field number of __gr_offs reg_offs_p = CGF.Builder.CreateStructGEP(VAListAddr, 3, "gr_offs_p"); reg_offs = CGF.Builder.CreateLoad(reg_offs_p, "gr_offs"); reg_top_index = 1; // field number for __gr_top RegSize = llvm::RoundUpToAlignment(RegSize, 8); } else { // 4 is the field number of __vr_offs. reg_offs_p = CGF.Builder.CreateStructGEP(VAListAddr, 4, "vr_offs_p"); reg_offs = CGF.Builder.CreateLoad(reg_offs_p, "vr_offs"); reg_top_index = 2; // field number for __vr_top RegSize = 16 * NumRegs; } //======================================= // Find out where argument was passed //======================================= // If reg_offs >= 0 we're already using the stack for this type of // argument. We don't want to keep updating reg_offs (in case it overflows, // though anyone passing 2GB of arguments, each at most 16 bytes, deserves // whatever they get). llvm::Value *UsingStack = nullptr; UsingStack = CGF.Builder.CreateICmpSGE( reg_offs, llvm::ConstantInt::get(CGF.Int32Ty, 0)); CGF.Builder.CreateCondBr(UsingStack, OnStackBlock, MaybeRegBlock); // Otherwise, at least some kind of argument could go in these registers, the // question is whether this particular type is too big. CGF.EmitBlock(MaybeRegBlock); // Integer arguments may need to correct register alignment (for example a // "struct { __int128 a; };" gets passed in x_2N, x_{2N+1}). In this case we // align __gr_offs to calculate the potential address. if (!IsFPR && !IsIndirect && Ctx.getTypeAlign(Ty) > 64) { int Align = Ctx.getTypeAlign(Ty) / 8; reg_offs = CGF.Builder.CreateAdd( reg_offs, llvm::ConstantInt::get(CGF.Int32Ty, Align - 1), "align_regoffs"); reg_offs = CGF.Builder.CreateAnd( reg_offs, llvm::ConstantInt::get(CGF.Int32Ty, -Align), "aligned_regoffs"); } // Update the gr_offs/vr_offs pointer for next call to va_arg on this va_list. llvm::Value *NewOffset = nullptr; NewOffset = CGF.Builder.CreateAdd( reg_offs, llvm::ConstantInt::get(CGF.Int32Ty, RegSize), "new_reg_offs"); CGF.Builder.CreateStore(NewOffset, reg_offs_p); // Now we're in a position to decide whether this argument really was in // registers or not. llvm::Value *InRegs = nullptr; InRegs = CGF.Builder.CreateICmpSLE( NewOffset, llvm::ConstantInt::get(CGF.Int32Ty, 0), "inreg"); CGF.Builder.CreateCondBr(InRegs, InRegBlock, OnStackBlock); //======================================= // Argument was in registers //======================================= // Now we emit the code for if the argument was originally passed in // registers. First start the appropriate block: CGF.EmitBlock(InRegBlock); llvm::Value *reg_top_p = nullptr, *reg_top = nullptr; reg_top_p = CGF.Builder.CreateStructGEP(VAListAddr, reg_top_index, "reg_top_p"); reg_top = CGF.Builder.CreateLoad(reg_top_p, "reg_top"); llvm::Value *BaseAddr = CGF.Builder.CreateGEP(reg_top, reg_offs); llvm::Value *RegAddr = nullptr; llvm::Type *MemTy = llvm::PointerType::getUnqual(CGF.ConvertTypeForMem(Ty)); if (IsIndirect) { // If it's been passed indirectly (actually a struct), whatever we find from // stored registers or on the stack will actually be a struct **. MemTy = llvm::PointerType::getUnqual(MemTy); } const Type *Base = nullptr; uint64_t NumMembers = 0; bool IsHFA = isHomogeneousAggregate(Ty, Base, NumMembers); if (IsHFA && NumMembers > 1) { // Homogeneous aggregates passed in registers will have their elements split // and stored 16-bytes apart regardless of size (they're notionally in qN, // qN+1, ...). We reload and store into a temporary local variable // contiguously. assert(!IsIndirect && "Homogeneous aggregates should be passed directly"); llvm::Type *BaseTy = CGF.ConvertType(QualType(Base, 0)); llvm::Type *HFATy = llvm::ArrayType::get(BaseTy, NumMembers); llvm::Value *Tmp = CGF.CreateTempAlloca(HFATy); int Offset = 0; if (CGF.CGM.getDataLayout().isBigEndian() && Ctx.getTypeSize(Base) < 128) Offset = 16 - Ctx.getTypeSize(Base) / 8; for (unsigned i = 0; i < NumMembers; ++i) { llvm::Value *BaseOffset = llvm::ConstantInt::get(CGF.Int32Ty, 16 * i + Offset); llvm::Value *LoadAddr = CGF.Builder.CreateGEP(BaseAddr, BaseOffset); LoadAddr = CGF.Builder.CreateBitCast( LoadAddr, llvm::PointerType::getUnqual(BaseTy)); llvm::Value *StoreAddr = CGF.Builder.CreateStructGEP(Tmp, i); llvm::Value *Elem = CGF.Builder.CreateLoad(LoadAddr); CGF.Builder.CreateStore(Elem, StoreAddr); } RegAddr = CGF.Builder.CreateBitCast(Tmp, MemTy); } else { // Otherwise the object is contiguous in memory unsigned BeAlign = reg_top_index == 2 ? 16 : 8; if (CGF.CGM.getDataLayout().isBigEndian() && (IsHFA || !isAggregateTypeForABI(Ty)) && Ctx.getTypeSize(Ty) < (BeAlign * 8)) { int Offset = BeAlign - Ctx.getTypeSize(Ty) / 8; BaseAddr = CGF.Builder.CreatePtrToInt(BaseAddr, CGF.Int64Ty); BaseAddr = CGF.Builder.CreateAdd( BaseAddr, llvm::ConstantInt::get(CGF.Int64Ty, Offset), "align_be"); BaseAddr = CGF.Builder.CreateIntToPtr(BaseAddr, CGF.Int8PtrTy); } RegAddr = CGF.Builder.CreateBitCast(BaseAddr, MemTy); } CGF.EmitBranch(ContBlock); //======================================= // Argument was on the stack //======================================= CGF.EmitBlock(OnStackBlock); llvm::Value *stack_p = nullptr, *OnStackAddr = nullptr; stack_p = CGF.Builder.CreateStructGEP(VAListAddr, 0, "stack_p"); OnStackAddr = CGF.Builder.CreateLoad(stack_p, "stack"); // Again, stack arguments may need realigmnent. In this case both integer and // floating-point ones might be affected. if (!IsIndirect && Ctx.getTypeAlign(Ty) > 64) { int Align = Ctx.getTypeAlign(Ty) / 8; OnStackAddr = CGF.Builder.CreatePtrToInt(OnStackAddr, CGF.Int64Ty); OnStackAddr = CGF.Builder.CreateAdd( OnStackAddr, llvm::ConstantInt::get(CGF.Int64Ty, Align - 1), "align_stack"); OnStackAddr = CGF.Builder.CreateAnd( OnStackAddr, llvm::ConstantInt::get(CGF.Int64Ty, -Align), "align_stack"); OnStackAddr = CGF.Builder.CreateIntToPtr(OnStackAddr, CGF.Int8PtrTy); } uint64_t StackSize; if (IsIndirect) StackSize = 8; else StackSize = Ctx.getTypeSize(Ty) / 8; // All stack slots are 8 bytes StackSize = llvm::RoundUpToAlignment(StackSize, 8); llvm::Value *StackSizeC = llvm::ConstantInt::get(CGF.Int32Ty, StackSize); llvm::Value *NewStack = CGF.Builder.CreateGEP(OnStackAddr, StackSizeC, "new_stack"); // Write the new value of __stack for the next call to va_arg CGF.Builder.CreateStore(NewStack, stack_p); if (CGF.CGM.getDataLayout().isBigEndian() && !isAggregateTypeForABI(Ty) && Ctx.getTypeSize(Ty) < 64) { int Offset = 8 - Ctx.getTypeSize(Ty) / 8; OnStackAddr = CGF.Builder.CreatePtrToInt(OnStackAddr, CGF.Int64Ty); OnStackAddr = CGF.Builder.CreateAdd( OnStackAddr, llvm::ConstantInt::get(CGF.Int64Ty, Offset), "align_be"); OnStackAddr = CGF.Builder.CreateIntToPtr(OnStackAddr, CGF.Int8PtrTy); } OnStackAddr = CGF.Builder.CreateBitCast(OnStackAddr, MemTy); CGF.EmitBranch(ContBlock); //======================================= // Tidy up //======================================= CGF.EmitBlock(ContBlock); llvm::PHINode *ResAddr = CGF.Builder.CreatePHI(MemTy, 2, "vaarg.addr"); ResAddr->addIncoming(RegAddr, InRegBlock); ResAddr->addIncoming(OnStackAddr, OnStackBlock); if (IsIndirect) return CGF.Builder.CreateLoad(ResAddr, "vaarg.addr"); return ResAddr; } llvm::Value *AArch64ABIInfo::EmitDarwinVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { // We do not support va_arg for aggregates or illegal vector types. // Lower VAArg here for these cases and use the LLVM va_arg instruction for // other cases. if (!isAggregateTypeForABI(Ty) && !isIllegalVectorType(Ty)) return nullptr; uint64_t Size = CGF.getContext().getTypeSize(Ty) / 8; uint64_t Align = CGF.getContext().getTypeAlign(Ty) / 8; const Type *Base = nullptr; uint64_t Members = 0; bool isHA = isHomogeneousAggregate(Ty, Base, Members); bool isIndirect = false; // Arguments bigger than 16 bytes which aren't homogeneous aggregates should // be passed indirectly. if (Size > 16 && !isHA) { isIndirect = true; Size = 8; Align = 8; } llvm::Type *BP = llvm::Type::getInt8PtrTy(CGF.getLLVMContext()); llvm::Type *BPP = llvm::PointerType::getUnqual(BP); CGBuilderTy &Builder = CGF.Builder; llvm::Value *VAListAddrAsBPP = Builder.CreateBitCast(VAListAddr, BPP, "ap"); llvm::Value *Addr = Builder.CreateLoad(VAListAddrAsBPP, "ap.cur"); if (isEmptyRecord(getContext(), Ty, true)) { // These are ignored for parameter passing purposes. llvm::Type *PTy = llvm::PointerType::getUnqual(CGF.ConvertType(Ty)); return Builder.CreateBitCast(Addr, PTy); } const uint64_t MinABIAlign = 8; if (Align > MinABIAlign) { llvm::Value *Offset = llvm::ConstantInt::get(CGF.Int32Ty, Align - 1); Addr = Builder.CreateGEP(Addr, Offset); llvm::Value *AsInt = Builder.CreatePtrToInt(Addr, CGF.Int64Ty); llvm::Value *Mask = llvm::ConstantInt::get(CGF.Int64Ty, ~(Align - 1)); llvm::Value *Aligned = Builder.CreateAnd(AsInt, Mask); Addr = Builder.CreateIntToPtr(Aligned, BP, "ap.align"); } uint64_t Offset = llvm::RoundUpToAlignment(Size, MinABIAlign); llvm::Value *NextAddr = Builder.CreateGEP( Addr, llvm::ConstantInt::get(CGF.Int32Ty, Offset), "ap.next"); Builder.CreateStore(NextAddr, VAListAddrAsBPP); if (isIndirect) Addr = Builder.CreateLoad(Builder.CreateBitCast(Addr, BPP)); llvm::Type *PTy = llvm::PointerType::getUnqual(CGF.ConvertType(Ty)); llvm::Value *AddrTyped = Builder.CreateBitCast(Addr, PTy); return AddrTyped; } //===----------------------------------------------------------------------===// // ARM ABI Implementation //===----------------------------------------------------------------------===// namespace { class ARMABIInfo : public ABIInfo { public: enum ABIKind { APCS = 0, AAPCS = 1, AAPCS_VFP }; private: ABIKind Kind; mutable int VFPRegs[16]; const unsigned NumVFPs; const unsigned NumGPRs; mutable unsigned AllocatedGPRs; mutable unsigned AllocatedVFPs; public: ARMABIInfo(CodeGenTypes &CGT, ABIKind _Kind) : ABIInfo(CGT), Kind(_Kind), NumVFPs(16), NumGPRs(4) { setCCs(); resetAllocatedRegs(); } bool isEABI() const { switch (getTarget().getTriple().getEnvironment()) { case llvm::Triple::Android: case llvm::Triple::EABI: case llvm::Triple::EABIHF: case llvm::Triple::GNUEABI: case llvm::Triple::GNUEABIHF: return true; default: return false; } } bool isEABIHF() const { switch (getTarget().getTriple().getEnvironment()) { case llvm::Triple::EABIHF: case llvm::Triple::GNUEABIHF: return true; default: return false; } } ABIKind getABIKind() const { return Kind; } private: ABIArgInfo classifyReturnType(QualType RetTy, bool isVariadic) const; ABIArgInfo classifyArgumentType(QualType RetTy, bool isVariadic, bool &IsCPRC) const; bool isIllegalVectorType(QualType Ty) const; bool isHomogeneousAggregateBaseType(QualType Ty) const override; bool isHomogeneousAggregateSmallEnough(const Type *Ty, uint64_t Members) const override; void computeInfo(CGFunctionInfo &FI) const override; llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; llvm::CallingConv::ID getLLVMDefaultCC() const; llvm::CallingConv::ID getABIDefaultCC() const; void setCCs(); void markAllocatedGPRs(unsigned Alignment, unsigned NumRequired) const; void markAllocatedVFPs(unsigned Alignment, unsigned NumRequired) const; void resetAllocatedRegs(void) const; }; class ARMTargetCodeGenInfo : public TargetCodeGenInfo { public: ARMTargetCodeGenInfo(CodeGenTypes &CGT, ARMABIInfo::ABIKind K) :TargetCodeGenInfo(new ARMABIInfo(CGT, K)) {} const ARMABIInfo &getABIInfo() const { return static_cast(TargetCodeGenInfo::getABIInfo()); } int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const override { return 13; } StringRef getARCRetainAutoreleasedReturnValueMarker() const override { return "mov\tr7, r7\t\t@ marker for objc_retainAutoreleaseReturnValue"; } bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const override { llvm::Value *Four8 = llvm::ConstantInt::get(CGF.Int8Ty, 4); // 0-15 are the 16 integer registers. AssignToArrayRange(CGF.Builder, Address, Four8, 0, 15); return false; } unsigned getSizeOfUnwindException() const override { if (getABIInfo().isEABI()) return 88; return TargetCodeGenInfo::getSizeOfUnwindException(); } void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &CGM) const override { const FunctionDecl *FD = dyn_cast(D); if (!FD) return; const ARMInterruptAttr *Attr = FD->getAttr(); if (!Attr) return; const char *Kind; switch (Attr->getInterrupt()) { case ARMInterruptAttr::Generic: Kind = ""; break; case ARMInterruptAttr::IRQ: Kind = "IRQ"; break; case ARMInterruptAttr::FIQ: Kind = "FIQ"; break; case ARMInterruptAttr::SWI: Kind = "SWI"; break; case ARMInterruptAttr::ABORT: Kind = "ABORT"; break; case ARMInterruptAttr::UNDEF: Kind = "UNDEF"; break; } llvm::Function *Fn = cast(GV); Fn->addFnAttr("interrupt", Kind); if (cast(getABIInfo()).getABIKind() == ARMABIInfo::APCS) return; // AAPCS guarantees that sp will be 8-byte aligned on any public interface, // however this is not necessarily true on taking any interrupt. Instruct // the backend to perform a realignment as part of the function prologue. llvm::AttrBuilder B; B.addStackAlignmentAttr(8); Fn->addAttributes(llvm::AttributeSet::FunctionIndex, llvm::AttributeSet::get(CGM.getLLVMContext(), llvm::AttributeSet::FunctionIndex, B)); } }; } void ARMABIInfo::computeInfo(CGFunctionInfo &FI) const { // To correctly handle Homogeneous Aggregate, we need to keep track of the // VFP registers allocated so far. // C.1.vfp If the argument is a VFP CPRC and there are sufficient consecutive // VFP registers of the appropriate type unallocated then the argument is // allocated to the lowest-numbered sequence of such registers. // C.2.vfp If the argument is a VFP CPRC then any VFP registers that are // unallocated are marked as unavailable. resetAllocatedRegs(); if (getCXXABI().classifyReturnType(FI)) { if (FI.getReturnInfo().isIndirect()) markAllocatedGPRs(1, 1); } else { FI.getReturnInfo() = classifyReturnType(FI.getReturnType(), FI.isVariadic()); } for (auto &I : FI.arguments()) { unsigned PreAllocationVFPs = AllocatedVFPs; unsigned PreAllocationGPRs = AllocatedGPRs; bool IsCPRC = false; // 6.1.2.3 There is one VFP co-processor register class using registers // s0-s15 (d0-d7) for passing arguments. I.info = classifyArgumentType(I.type, FI.isVariadic(), IsCPRC); // If we have allocated some arguments onto the stack (due to running // out of VFP registers), we cannot split an argument between GPRs and // the stack. If this situation occurs, we add padding to prevent the // GPRs from being used. In this situation, the current argument could // only be allocated by rule C.8, so rule C.6 would mark these GPRs as // unusable anyway. // We do not have to do this if the argument is being passed ByVal, as the // backend can handle that situation correctly. const bool StackUsed = PreAllocationGPRs > NumGPRs || PreAllocationVFPs > NumVFPs; const bool IsByVal = I.info.isIndirect() && I.info.getIndirectByVal(); if (!IsCPRC && PreAllocationGPRs < NumGPRs && AllocatedGPRs > NumGPRs && StackUsed && !IsByVal) { llvm::Type *PaddingTy = llvm::ArrayType::get( llvm::Type::getInt32Ty(getVMContext()), NumGPRs - PreAllocationGPRs); if (I.info.canHaveCoerceToType()) { I.info = ABIArgInfo::getDirect(I.info.getCoerceToType() /* type */, 0 /* offset */, PaddingTy, true); } else { I.info = ABIArgInfo::getDirect(nullptr /* type */, 0 /* offset */, PaddingTy, true); } } } // Always honor user-specified calling convention. if (FI.getCallingConvention() != llvm::CallingConv::C) return; llvm::CallingConv::ID cc = getRuntimeCC(); if (cc != llvm::CallingConv::C) FI.setEffectiveCallingConvention(cc); } /// Return the default calling convention that LLVM will use. llvm::CallingConv::ID ARMABIInfo::getLLVMDefaultCC() const { // The default calling convention that LLVM will infer. if (isEABIHF()) return llvm::CallingConv::ARM_AAPCS_VFP; else if (isEABI()) return llvm::CallingConv::ARM_AAPCS; else return llvm::CallingConv::ARM_APCS; } /// Return the calling convention that our ABI would like us to use /// as the C calling convention. llvm::CallingConv::ID ARMABIInfo::getABIDefaultCC() const { switch (getABIKind()) { case APCS: return llvm::CallingConv::ARM_APCS; case AAPCS: return llvm::CallingConv::ARM_AAPCS; case AAPCS_VFP: return llvm::CallingConv::ARM_AAPCS_VFP; } llvm_unreachable("bad ABI kind"); } void ARMABIInfo::setCCs() { assert(getRuntimeCC() == llvm::CallingConv::C); // Don't muddy up the IR with a ton of explicit annotations if // they'd just match what LLVM will infer from the triple. llvm::CallingConv::ID abiCC = getABIDefaultCC(); if (abiCC != getLLVMDefaultCC()) RuntimeCC = abiCC; BuiltinCC = (getABIKind() == APCS ? llvm::CallingConv::ARM_APCS : llvm::CallingConv::ARM_AAPCS); } /// markAllocatedVFPs - update VFPRegs according to the alignment and /// number of VFP registers (unit is S register) requested. void ARMABIInfo::markAllocatedVFPs(unsigned Alignment, unsigned NumRequired) const { // Early Exit. if (AllocatedVFPs >= 16) { // We use AllocatedVFP > 16 to signal that some CPRCs were allocated on // the stack. AllocatedVFPs = 17; return; } // C.1.vfp If the argument is a VFP CPRC and there are sufficient consecutive // VFP registers of the appropriate type unallocated then the argument is // allocated to the lowest-numbered sequence of such registers. for (unsigned I = 0; I < 16; I += Alignment) { bool FoundSlot = true; for (unsigned J = I, JEnd = I + NumRequired; J < JEnd; J++) if (J >= 16 || VFPRegs[J]) { FoundSlot = false; break; } if (FoundSlot) { for (unsigned J = I, JEnd = I + NumRequired; J < JEnd; J++) VFPRegs[J] = 1; AllocatedVFPs += NumRequired; return; } } // C.2.vfp If the argument is a VFP CPRC then any VFP registers that are // unallocated are marked as unavailable. for (unsigned I = 0; I < 16; I++) VFPRegs[I] = 1; AllocatedVFPs = 17; // We do not have enough VFP registers. } /// Update AllocatedGPRs to record the number of general purpose registers /// which have been allocated. It is valid for AllocatedGPRs to go above 4, /// this represents arguments being stored on the stack. void ARMABIInfo::markAllocatedGPRs(unsigned Alignment, unsigned NumRequired) const { assert((Alignment == 1 || Alignment == 2) && "Alignment must be 4 or 8 bytes"); if (Alignment == 2 && AllocatedGPRs & 0x1) AllocatedGPRs += 1; AllocatedGPRs += NumRequired; } void ARMABIInfo::resetAllocatedRegs(void) const { AllocatedGPRs = 0; AllocatedVFPs = 0; for (unsigned i = 0; i < NumVFPs; ++i) VFPRegs[i] = 0; } ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, bool isVariadic, bool &IsCPRC) const { // We update number of allocated VFPs according to // 6.1.2.1 The following argument types are VFP CPRCs: // A single-precision floating-point type (including promoted // half-precision types); A double-precision floating-point type; // A 64-bit or 128-bit containerized vector type; Homogeneous Aggregate // with a Base Type of a single- or double-precision floating-point type, // 64-bit containerized vectors or 128-bit containerized vectors with one // to four Elements. bool IsEffectivelyAAPCS_VFP = getABIKind() == AAPCS_VFP && !isVariadic; Ty = useFirstFieldIfTransparentUnion(Ty); // Handle illegal vector types here. if (isIllegalVectorType(Ty)) { uint64_t Size = getContext().getTypeSize(Ty); if (Size <= 32) { llvm::Type *ResType = llvm::Type::getInt32Ty(getVMContext()); markAllocatedGPRs(1, 1); return ABIArgInfo::getDirect(ResType); } if (Size == 64) { llvm::Type *ResType = llvm::VectorType::get( llvm::Type::getInt32Ty(getVMContext()), 2); if (getABIKind() == ARMABIInfo::AAPCS || isVariadic){ markAllocatedGPRs(2, 2); } else { markAllocatedVFPs(2, 2); IsCPRC = true; } return ABIArgInfo::getDirect(ResType); } if (Size == 128) { llvm::Type *ResType = llvm::VectorType::get( llvm::Type::getInt32Ty(getVMContext()), 4); if (getABIKind() == ARMABIInfo::AAPCS || isVariadic) { markAllocatedGPRs(2, 4); } else { markAllocatedVFPs(4, 4); IsCPRC = true; } return ABIArgInfo::getDirect(ResType); } markAllocatedGPRs(1, 1); return ABIArgInfo::getIndirect(0, /*ByVal=*/false); } // Update VFPRegs for legal vector types. if (getABIKind() == ARMABIInfo::AAPCS_VFP && !isVariadic) { if (const VectorType *VT = Ty->getAs()) { uint64_t Size = getContext().getTypeSize(VT); // Size of a legal vector should be power of 2 and above 64. markAllocatedVFPs(Size >= 128 ? 4 : 2, Size / 32); IsCPRC = true; } } // Update VFPRegs for floating point types. if (getABIKind() == ARMABIInfo::AAPCS_VFP && !isVariadic) { if (const BuiltinType *BT = Ty->getAs()) { if (BT->getKind() == BuiltinType::Half || BT->getKind() == BuiltinType::Float) { markAllocatedVFPs(1, 1); IsCPRC = true; } if (BT->getKind() == BuiltinType::Double || BT->getKind() == BuiltinType::LongDouble) { markAllocatedVFPs(2, 2); IsCPRC = true; } } } if (!isAggregateTypeForABI(Ty)) { // Treat an enum type as its underlying type. if (const EnumType *EnumTy = Ty->getAs()) { Ty = EnumTy->getDecl()->getIntegerType(); } unsigned Size = getContext().getTypeSize(Ty); if (!IsCPRC) markAllocatedGPRs(Size > 32 ? 2 : 1, (Size + 31) / 32); return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) { markAllocatedGPRs(1, 1); return ABIArgInfo::getIndirect(0, RAA == CGCXXABI::RAA_DirectInMemory); } // Ignore empty records. if (isEmptyRecord(getContext(), Ty, true)) return ABIArgInfo::getIgnore(); if (IsEffectivelyAAPCS_VFP) { // Homogeneous Aggregates need to be expanded when we can fit the aggregate // into VFP registers. const Type *Base = nullptr; uint64_t Members = 0; if (isHomogeneousAggregate(Ty, Base, Members)) { assert(Base && "Base class should be set for homogeneous aggregate"); // Base can be a floating-point or a vector. if (Base->isVectorType()) { // ElementSize is in number of floats. unsigned ElementSize = getContext().getTypeSize(Base) == 64 ? 2 : 4; markAllocatedVFPs(ElementSize, Members * ElementSize); } else if (Base->isSpecificBuiltinType(BuiltinType::Float)) markAllocatedVFPs(1, Members); else { assert(Base->isSpecificBuiltinType(BuiltinType::Double) || Base->isSpecificBuiltinType(BuiltinType::LongDouble)); markAllocatedVFPs(2, Members * 2); } IsCPRC = true; return ABIArgInfo::getDirect(nullptr, 0, nullptr, false); } } // Support byval for ARM. // The ABI alignment for APCS is 4-byte and for AAPCS at least 4-byte and at // most 8-byte. We realign the indirect argument if type alignment is bigger // than ABI alignment. uint64_t ABIAlign = 4; uint64_t TyAlign = getContext().getTypeAlign(Ty) / 8; if (getABIKind() == ARMABIInfo::AAPCS_VFP || getABIKind() == ARMABIInfo::AAPCS) ABIAlign = std::min(std::max(TyAlign, (uint64_t)4), (uint64_t)8); if (getContext().getTypeSizeInChars(Ty) > CharUnits::fromQuantity(64)) { // Update Allocated GPRs. Since this is only used when the size of the // argument is greater than 64 bytes, this will always use up any available // registers (of which there are 4). We also don't care about getting the // alignment right, because general-purpose registers cannot be back-filled. markAllocatedGPRs(1, 4); return ABIArgInfo::getIndirect(TyAlign, /*ByVal=*/true, /*Realign=*/TyAlign > ABIAlign); } // Otherwise, pass by coercing to a structure of the appropriate size. llvm::Type* ElemTy; unsigned SizeRegs; // FIXME: Try to match the types of the arguments more accurately where // we can. if (getContext().getTypeAlign(Ty) <= 32) { ElemTy = llvm::Type::getInt32Ty(getVMContext()); SizeRegs = (getContext().getTypeSize(Ty) + 31) / 32; markAllocatedGPRs(1, SizeRegs); } else { ElemTy = llvm::Type::getInt64Ty(getVMContext()); SizeRegs = (getContext().getTypeSize(Ty) + 63) / 64; markAllocatedGPRs(2, SizeRegs * 2); } return ABIArgInfo::getDirect(llvm::ArrayType::get(ElemTy, SizeRegs)); } static bool isIntegerLikeType(QualType Ty, ASTContext &Context, llvm::LLVMContext &VMContext) { // APCS, C Language Calling Conventions, Non-Simple Return Values: A structure // is called integer-like if its size is less than or equal to one word, and // the offset of each of its addressable sub-fields is zero. uint64_t Size = Context.getTypeSize(Ty); // Check that the type fits in a word. if (Size > 32) return false; // FIXME: Handle vector types! if (Ty->isVectorType()) return false; // Float types are never treated as "integer like". if (Ty->isRealFloatingType()) return false; // If this is a builtin or pointer type then it is ok. if (Ty->getAs() || Ty->isPointerType()) return true; // Small complex integer types are "integer like". if (const ComplexType *CT = Ty->getAs()) return isIntegerLikeType(CT->getElementType(), Context, VMContext); // Single element and zero sized arrays should be allowed, by the definition // above, but they are not. // Otherwise, it must be a record type. const RecordType *RT = Ty->getAs(); if (!RT) return false; // Ignore records with flexible arrays. const RecordDecl *RD = RT->getDecl(); if (RD->hasFlexibleArrayMember()) return false; // Check that all sub-fields are at offset 0, and are themselves "integer // like". const ASTRecordLayout &Layout = Context.getASTRecordLayout(RD); bool HadField = false; unsigned idx = 0; for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end(); i != e; ++i, ++idx) { const FieldDecl *FD = *i; // Bit-fields are not addressable, we only need to verify they are "integer // like". We still have to disallow a subsequent non-bitfield, for example: // struct { int : 0; int x } // is non-integer like according to gcc. if (FD->isBitField()) { if (!RD->isUnion()) HadField = true; if (!isIntegerLikeType(FD->getType(), Context, VMContext)) return false; continue; } // Check if this field is at offset 0. if (Layout.getFieldOffset(idx) != 0) return false; if (!isIntegerLikeType(FD->getType(), Context, VMContext)) return false; // Only allow at most one field in a structure. This doesn't match the // wording above, but follows gcc in situations with a field following an // empty structure. if (!RD->isUnion()) { if (HadField) return false; HadField = true; } } return true; } ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy, bool isVariadic) const { bool IsEffectivelyAAPCS_VFP = getABIKind() == AAPCS_VFP && !isVariadic; if (RetTy->isVoidType()) return ABIArgInfo::getIgnore(); // Large vector types should be returned via memory. if (RetTy->isVectorType() && getContext().getTypeSize(RetTy) > 128) { markAllocatedGPRs(1, 1); return ABIArgInfo::getIndirect(0); } if (!isAggregateTypeForABI(RetTy)) { // Treat an enum type as its underlying type. if (const EnumType *EnumTy = RetTy->getAs()) RetTy = EnumTy->getDecl()->getIntegerType(); return RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect(); } // Are we following APCS? if (getABIKind() == APCS) { if (isEmptyRecord(getContext(), RetTy, false)) return ABIArgInfo::getIgnore(); // Complex types are all returned as packed integers. // // FIXME: Consider using 2 x vector types if the back end handles them // correctly. if (RetTy->isAnyComplexType()) return ABIArgInfo::getDirect(llvm::IntegerType::get( getVMContext(), getContext().getTypeSize(RetTy))); // Integer like structures are returned in r0. if (isIntegerLikeType(RetTy, getContext(), getVMContext())) { // Return in the smallest viable integer type. uint64_t Size = getContext().getTypeSize(RetTy); if (Size <= 8) return ABIArgInfo::getDirect(llvm::Type::getInt8Ty(getVMContext())); if (Size <= 16) return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext())); return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext())); } // Otherwise return in memory. markAllocatedGPRs(1, 1); return ABIArgInfo::getIndirect(0); } // Otherwise this is an AAPCS variant. if (isEmptyRecord(getContext(), RetTy, true)) return ABIArgInfo::getIgnore(); // Check for homogeneous aggregates with AAPCS-VFP. if (IsEffectivelyAAPCS_VFP) { const Type *Base = nullptr; uint64_t Members; if (isHomogeneousAggregate(RetTy, Base, Members)) { assert(Base && "Base class should be set for homogeneous aggregate"); // Homogeneous Aggregates are returned directly. return ABIArgInfo::getDirect(nullptr, 0, nullptr, false); } } // Aggregates <= 4 bytes are returned in r0; other aggregates // are returned indirectly. uint64_t Size = getContext().getTypeSize(RetTy); if (Size <= 32) { if (getDataLayout().isBigEndian()) // Return in 32 bit integer integer type (as if loaded by LDR, AAPCS 5.4) return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext())); // Return in the smallest viable integer type. if (Size <= 8) return ABIArgInfo::getDirect(llvm::Type::getInt8Ty(getVMContext())); if (Size <= 16) return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext())); return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext())); } markAllocatedGPRs(1, 1); return ABIArgInfo::getIndirect(0); } /// isIllegalVector - check whether Ty is an illegal vector type. bool ARMABIInfo::isIllegalVectorType(QualType Ty) const { if (const VectorType *VT = Ty->getAs()) { // Check whether VT is legal. unsigned NumElements = VT->getNumElements(); uint64_t Size = getContext().getTypeSize(VT); // NumElements should be power of 2. if ((NumElements & (NumElements - 1)) != 0) return true; // Size should be greater than 32 bits. return Size <= 32; } return false; } bool ARMABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const { // Homogeneous aggregates for AAPCS-VFP must have base types of float, // double, or 64-bit or 128-bit vectors. if (const BuiltinType *BT = Ty->getAs()) { if (BT->getKind() == BuiltinType::Float || BT->getKind() == BuiltinType::Double || BT->getKind() == BuiltinType::LongDouble) return true; } else if (const VectorType *VT = Ty->getAs()) { unsigned VecSize = getContext().getTypeSize(VT); if (VecSize == 64 || VecSize == 128) return true; } return false; } bool ARMABIInfo::isHomogeneousAggregateSmallEnough(const Type *Base, uint64_t Members) const { return Members <= 4; } llvm::Value *ARMABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { llvm::Type *BP = CGF.Int8PtrTy; llvm::Type *BPP = CGF.Int8PtrPtrTy; CGBuilderTy &Builder = CGF.Builder; llvm::Value *VAListAddrAsBPP = Builder.CreateBitCast(VAListAddr, BPP, "ap"); llvm::Value *Addr = Builder.CreateLoad(VAListAddrAsBPP, "ap.cur"); if (isEmptyRecord(getContext(), Ty, true)) { // These are ignored for parameter passing purposes. llvm::Type *PTy = llvm::PointerType::getUnqual(CGF.ConvertType(Ty)); return Builder.CreateBitCast(Addr, PTy); } uint64_t Size = CGF.getContext().getTypeSize(Ty) / 8; uint64_t TyAlign = CGF.getContext().getTypeAlign(Ty) / 8; bool IsIndirect = false; // The ABI alignment for 64-bit or 128-bit vectors is 8 for AAPCS and 4 for // APCS. For AAPCS, the ABI alignment is at least 4-byte and at most 8-byte. if (getABIKind() == ARMABIInfo::AAPCS_VFP || getABIKind() == ARMABIInfo::AAPCS) TyAlign = std::min(std::max(TyAlign, (uint64_t)4), (uint64_t)8); else TyAlign = 4; // Use indirect if size of the illegal vector is bigger than 16 bytes. if (isIllegalVectorType(Ty) && Size > 16) { IsIndirect = true; Size = 4; TyAlign = 4; } // Handle address alignment for ABI alignment > 4 bytes. if (TyAlign > 4) { assert((TyAlign & (TyAlign - 1)) == 0 && "Alignment is not power of 2!"); llvm::Value *AddrAsInt = Builder.CreatePtrToInt(Addr, CGF.Int32Ty); AddrAsInt = Builder.CreateAdd(AddrAsInt, Builder.getInt32(TyAlign - 1)); AddrAsInt = Builder.CreateAnd(AddrAsInt, Builder.getInt32(~(TyAlign - 1))); Addr = Builder.CreateIntToPtr(AddrAsInt, BP, "ap.align"); } uint64_t Offset = llvm::RoundUpToAlignment(Size, 4); llvm::Value *NextAddr = Builder.CreateGEP(Addr, llvm::ConstantInt::get(CGF.Int32Ty, Offset), "ap.next"); Builder.CreateStore(NextAddr, VAListAddrAsBPP); if (IsIndirect) Addr = Builder.CreateLoad(Builder.CreateBitCast(Addr, BPP)); else if (TyAlign < CGF.getContext().getTypeAlign(Ty) / 8) { // We can't directly cast ap.cur to pointer to a vector type, since ap.cur // may not be correctly aligned for the vector type. We create an aligned // temporary space and copy the content over from ap.cur to the temporary // space. This is necessary if the natural alignment of the type is greater // than the ABI alignment. llvm::Type *I8PtrTy = Builder.getInt8PtrTy(); CharUnits CharSize = getContext().getTypeSizeInChars(Ty); llvm::Value *AlignedTemp = CGF.CreateTempAlloca(CGF.ConvertType(Ty), "var.align"); llvm::Value *Dst = Builder.CreateBitCast(AlignedTemp, I8PtrTy); llvm::Value *Src = Builder.CreateBitCast(Addr, I8PtrTy); Builder.CreateMemCpy(Dst, Src, llvm::ConstantInt::get(CGF.IntPtrTy, CharSize.getQuantity()), TyAlign, false); Addr = AlignedTemp; //The content is in aligned location. } llvm::Type *PTy = llvm::PointerType::getUnqual(CGF.ConvertType(Ty)); llvm::Value *AddrTyped = Builder.CreateBitCast(Addr, PTy); return AddrTyped; } namespace { class NaClARMABIInfo : public ABIInfo { public: NaClARMABIInfo(CodeGen::CodeGenTypes &CGT, ARMABIInfo::ABIKind Kind) : ABIInfo(CGT), PInfo(CGT), NInfo(CGT, Kind) {} void computeInfo(CGFunctionInfo &FI) const override; llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; private: PNaClABIInfo PInfo; // Used for generating calls with pnaclcall callingconv. ARMABIInfo NInfo; // Used for everything else. }; class NaClARMTargetCodeGenInfo : public TargetCodeGenInfo { public: NaClARMTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT, ARMABIInfo::ABIKind Kind) : TargetCodeGenInfo(new NaClARMABIInfo(CGT, Kind)) {} }; } void NaClARMABIInfo::computeInfo(CGFunctionInfo &FI) const { if (FI.getASTCallingConvention() == CC_PnaclCall) PInfo.computeInfo(FI); else static_cast(NInfo).computeInfo(FI); } llvm::Value *NaClARMABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { // Always use the native convention; calling pnacl-style varargs functions // is unsupported. return static_cast(NInfo).EmitVAArg(VAListAddr, Ty, CGF); } //===----------------------------------------------------------------------===// // NVPTX ABI Implementation //===----------------------------------------------------------------------===// namespace { class NVPTXABIInfo : public ABIInfo { public: NVPTXABIInfo(CodeGenTypes &CGT) : ABIInfo(CGT) {} ABIArgInfo classifyReturnType(QualType RetTy) const; ABIArgInfo classifyArgumentType(QualType Ty) const; void computeInfo(CGFunctionInfo &FI) const override; llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CFG) const override; }; class NVPTXTargetCodeGenInfo : public TargetCodeGenInfo { public: NVPTXTargetCodeGenInfo(CodeGenTypes &CGT) : TargetCodeGenInfo(new NVPTXABIInfo(CGT)) {} void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const override; private: // Adds a NamedMDNode with F, Name, and Operand as operands, and adds the // resulting MDNode to the nvvm.annotations MDNode. static void addNVVMMetadata(llvm::Function *F, StringRef Name, int Operand); }; ABIArgInfo NVPTXABIInfo::classifyReturnType(QualType RetTy) const { if (RetTy->isVoidType()) return ABIArgInfo::getIgnore(); // note: this is different from default ABI if (!RetTy->isScalarType()) return ABIArgInfo::getDirect(); // Treat an enum type as its underlying type. if (const EnumType *EnumTy = RetTy->getAs()) RetTy = EnumTy->getDecl()->getIntegerType(); return (RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } ABIArgInfo NVPTXABIInfo::classifyArgumentType(QualType Ty) const { // Treat an enum type as its underlying type. if (const EnumType *EnumTy = Ty->getAs()) Ty = EnumTy->getDecl()->getIntegerType(); // Return aggregates type as indirect by value if (isAggregateTypeForABI(Ty)) return ABIArgInfo::getIndirect(0, /* byval */ true); return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } void NVPTXABIInfo::computeInfo(CGFunctionInfo &FI) const { if (!getCXXABI().classifyReturnType(FI)) FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); for (auto &I : FI.arguments()) I.info = classifyArgumentType(I.type); // Always honor user-specified calling convention. if (FI.getCallingConvention() != llvm::CallingConv::C) return; FI.setEffectiveCallingConvention(getRuntimeCC()); } llvm::Value *NVPTXABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CFG) const { llvm_unreachable("NVPTX does not support varargs"); } void NVPTXTargetCodeGenInfo:: SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const{ const FunctionDecl *FD = dyn_cast(D); if (!FD) return; llvm::Function *F = cast(GV); // Perform special handling in OpenCL mode if (M.getLangOpts().OpenCL) { // Use OpenCL function attributes to check for kernel functions // By default, all functions are device functions if (FD->hasAttr()) { // OpenCL __kernel functions get kernel metadata // Create !{, metadata !"kernel", i32 1} node addNVVMMetadata(F, "kernel", 1); // And kernel functions are not subject to inlining F->addFnAttr(llvm::Attribute::NoInline); } } // Perform special handling in CUDA mode. if (M.getLangOpts().CUDA) { // CUDA __global__ functions get a kernel metadata entry. Since // __global__ functions cannot be called from the device, we do not // need to set the noinline attribute. if (FD->hasAttr()) { // Create !{, metadata !"kernel", i32 1} node addNVVMMetadata(F, "kernel", 1); } if (FD->hasAttr()) { // Create !{, metadata !"maxntidx", i32 } node addNVVMMetadata(F, "maxntidx", FD->getAttr()->getMaxThreads()); // min blocks is a default argument for CUDALaunchBoundsAttr, so getting a // zero value from getMinBlocks either means it was not specified in // __launch_bounds__ or the user specified a 0 value. In both cases, we // don't have to add a PTX directive. int MinCTASM = FD->getAttr()->getMinBlocks(); if (MinCTASM > 0) { // Create !{, metadata !"minctasm", i32 } node addNVVMMetadata(F, "minctasm", MinCTASM); } } } } void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::Function *F, StringRef Name, int Operand) { llvm::Module *M = F->getParent(); llvm::LLVMContext &Ctx = M->getContext(); // Get "nvvm.annotations" metadata node llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations"); llvm::Metadata *MDVals[] = { llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, Name), llvm::ConstantAsMetadata::get( llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))}; // Append metadata to nvvm.annotations MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); } } //===----------------------------------------------------------------------===// // SystemZ ABI Implementation //===----------------------------------------------------------------------===// namespace { class SystemZABIInfo : public ABIInfo { public: SystemZABIInfo(CodeGenTypes &CGT) : ABIInfo(CGT) {} bool isPromotableIntegerType(QualType Ty) const; bool isCompoundType(QualType Ty) const; bool isFPArgumentType(QualType Ty) const; ABIArgInfo classifyReturnType(QualType RetTy) const; ABIArgInfo classifyArgumentType(QualType ArgTy) const; void computeInfo(CGFunctionInfo &FI) const override { if (!getCXXABI().classifyReturnType(FI)) FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); for (auto &I : FI.arguments()) I.info = classifyArgumentType(I.type); } llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; }; class SystemZTargetCodeGenInfo : public TargetCodeGenInfo { public: SystemZTargetCodeGenInfo(CodeGenTypes &CGT) : TargetCodeGenInfo(new SystemZABIInfo(CGT)) {} }; } bool SystemZABIInfo::isPromotableIntegerType(QualType Ty) const { // Treat an enum type as its underlying type. if (const EnumType *EnumTy = Ty->getAs()) Ty = EnumTy->getDecl()->getIntegerType(); // Promotable integer types are required to be promoted by the ABI. if (Ty->isPromotableIntegerType()) return true; // 32-bit values must also be promoted. if (const BuiltinType *BT = Ty->getAs()) switch (BT->getKind()) { case BuiltinType::Int: case BuiltinType::UInt: return true; default: return false; } return false; } bool SystemZABIInfo::isCompoundType(QualType Ty) const { return Ty->isAnyComplexType() || isAggregateTypeForABI(Ty); } bool SystemZABIInfo::isFPArgumentType(QualType Ty) const { if (const BuiltinType *BT = Ty->getAs()) switch (BT->getKind()) { case BuiltinType::Float: case BuiltinType::Double: return true; default: return false; } if (const RecordType *RT = Ty->getAsStructureType()) { const RecordDecl *RD = RT->getDecl(); bool Found = false; // If this is a C++ record, check the bases first. if (const CXXRecordDecl *CXXRD = dyn_cast(RD)) for (const auto &I : CXXRD->bases()) { QualType Base = I.getType(); // Empty bases don't affect things either way. if (isEmptyRecord(getContext(), Base, true)) continue; if (Found) return false; Found = isFPArgumentType(Base); if (!Found) return false; } // Check the fields. for (const auto *FD : RD->fields()) { // Empty bitfields don't affect things either way. // Unlike isSingleElementStruct(), empty structure and array fields // do count. So do anonymous bitfields that aren't zero-sized. if (FD->isBitField() && FD->getBitWidthValue(getContext()) == 0) return true; // Unlike isSingleElementStruct(), arrays do not count. // Nested isFPArgumentType structures still do though. if (Found) return false; Found = isFPArgumentType(FD->getType()); if (!Found) return false; } // Unlike isSingleElementStruct(), trailing padding is allowed. // An 8-byte aligned struct s { float f; } is passed as a double. return Found; } return false; } llvm::Value *SystemZABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { // Assume that va_list type is correct; should be pointer to LLVM type: // struct { // i64 __gpr; // i64 __fpr; // i8 *__overflow_arg_area; // i8 *__reg_save_area; // }; // Every argument occupies 8 bytes and is passed by preference in either // GPRs or FPRs. Ty = CGF.getContext().getCanonicalType(Ty); ABIArgInfo AI = classifyArgumentType(Ty); bool InFPRs = isFPArgumentType(Ty); llvm::Type *APTy = llvm::PointerType::getUnqual(CGF.ConvertTypeForMem(Ty)); bool IsIndirect = AI.isIndirect(); unsigned UnpaddedBitSize; if (IsIndirect) { APTy = llvm::PointerType::getUnqual(APTy); UnpaddedBitSize = 64; } else UnpaddedBitSize = getContext().getTypeSize(Ty); unsigned PaddedBitSize = 64; assert((UnpaddedBitSize <= PaddedBitSize) && "Invalid argument size."); unsigned PaddedSize = PaddedBitSize / 8; unsigned Padding = (PaddedBitSize - UnpaddedBitSize) / 8; unsigned MaxRegs, RegCountField, RegSaveIndex, RegPadding; if (InFPRs) { MaxRegs = 4; // Maximum of 4 FPR arguments RegCountField = 1; // __fpr RegSaveIndex = 16; // save offset for f0 RegPadding = 0; // floats are passed in the high bits of an FPR } else { MaxRegs = 5; // Maximum of 5 GPR arguments RegCountField = 0; // __gpr RegSaveIndex = 2; // save offset for r2 RegPadding = Padding; // values are passed in the low bits of a GPR } llvm::Value *RegCountPtr = CGF.Builder.CreateStructGEP(VAListAddr, RegCountField, "reg_count_ptr"); llvm::Value *RegCount = CGF.Builder.CreateLoad(RegCountPtr, "reg_count"); llvm::Type *IndexTy = RegCount->getType(); llvm::Value *MaxRegsV = llvm::ConstantInt::get(IndexTy, MaxRegs); llvm::Value *InRegs = CGF.Builder.CreateICmpULT(RegCount, MaxRegsV, "fits_in_regs"); llvm::BasicBlock *InRegBlock = CGF.createBasicBlock("vaarg.in_reg"); llvm::BasicBlock *InMemBlock = CGF.createBasicBlock("vaarg.in_mem"); llvm::BasicBlock *ContBlock = CGF.createBasicBlock("vaarg.end"); CGF.Builder.CreateCondBr(InRegs, InRegBlock, InMemBlock); // Emit code to load the value if it was passed in registers. CGF.EmitBlock(InRegBlock); // Work out the address of an argument register. llvm::Value *PaddedSizeV = llvm::ConstantInt::get(IndexTy, PaddedSize); llvm::Value *ScaledRegCount = CGF.Builder.CreateMul(RegCount, PaddedSizeV, "scaled_reg_count"); llvm::Value *RegBase = llvm::ConstantInt::get(IndexTy, RegSaveIndex * PaddedSize + RegPadding); llvm::Value *RegOffset = CGF.Builder.CreateAdd(ScaledRegCount, RegBase, "reg_offset"); llvm::Value *RegSaveAreaPtr = CGF.Builder.CreateStructGEP(VAListAddr, 3, "reg_save_area_ptr"); llvm::Value *RegSaveArea = CGF.Builder.CreateLoad(RegSaveAreaPtr, "reg_save_area"); llvm::Value *RawRegAddr = CGF.Builder.CreateGEP(RegSaveArea, RegOffset, "raw_reg_addr"); llvm::Value *RegAddr = CGF.Builder.CreateBitCast(RawRegAddr, APTy, "reg_addr"); // Update the register count llvm::Value *One = llvm::ConstantInt::get(IndexTy, 1); llvm::Value *NewRegCount = CGF.Builder.CreateAdd(RegCount, One, "reg_count"); CGF.Builder.CreateStore(NewRegCount, RegCountPtr); CGF.EmitBranch(ContBlock); // Emit code to load the value if it was passed in memory. CGF.EmitBlock(InMemBlock); // Work out the address of a stack argument. llvm::Value *OverflowArgAreaPtr = CGF.Builder.CreateStructGEP(VAListAddr, 2, "overflow_arg_area_ptr"); llvm::Value *OverflowArgArea = CGF.Builder.CreateLoad(OverflowArgAreaPtr, "overflow_arg_area"); llvm::Value *PaddingV = llvm::ConstantInt::get(IndexTy, Padding); llvm::Value *RawMemAddr = CGF.Builder.CreateGEP(OverflowArgArea, PaddingV, "raw_mem_addr"); llvm::Value *MemAddr = CGF.Builder.CreateBitCast(RawMemAddr, APTy, "mem_addr"); // Update overflow_arg_area_ptr pointer llvm::Value *NewOverflowArgArea = CGF.Builder.CreateGEP(OverflowArgArea, PaddedSizeV, "overflow_arg_area"); CGF.Builder.CreateStore(NewOverflowArgArea, OverflowArgAreaPtr); CGF.EmitBranch(ContBlock); // Return the appropriate result. CGF.EmitBlock(ContBlock); llvm::PHINode *ResAddr = CGF.Builder.CreatePHI(APTy, 2, "va_arg.addr"); ResAddr->addIncoming(RegAddr, InRegBlock); ResAddr->addIncoming(MemAddr, InMemBlock); if (IsIndirect) return CGF.Builder.CreateLoad(ResAddr, "indirect_arg"); return ResAddr; } ABIArgInfo SystemZABIInfo::classifyReturnType(QualType RetTy) const { if (RetTy->isVoidType()) return ABIArgInfo::getIgnore(); if (isCompoundType(RetTy) || getContext().getTypeSize(RetTy) > 64) return ABIArgInfo::getIndirect(0); return (isPromotableIntegerType(RetTy) ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const { // Handle the generic C++ ABI. if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) return ABIArgInfo::getIndirect(0, RAA == CGCXXABI::RAA_DirectInMemory); // Integers and enums are extended to full register width. if (isPromotableIntegerType(Ty)) return ABIArgInfo::getExtend(); // Values that are not 1, 2, 4 or 8 bytes in size are passed indirectly. uint64_t Size = getContext().getTypeSize(Ty); if (Size != 8 && Size != 16 && Size != 32 && Size != 64) return ABIArgInfo::getIndirect(0, /*ByVal=*/false); // Handle small structures. if (const RecordType *RT = Ty->getAs()) { // Structures with flexible arrays have variable length, so really // fail the size test above. const RecordDecl *RD = RT->getDecl(); if (RD->hasFlexibleArrayMember()) return ABIArgInfo::getIndirect(0, /*ByVal=*/false); // The structure is passed as an unextended integer, a float, or a double. llvm::Type *PassTy; if (isFPArgumentType(Ty)) { assert(Size == 32 || Size == 64); if (Size == 32) PassTy = llvm::Type::getFloatTy(getVMContext()); else PassTy = llvm::Type::getDoubleTy(getVMContext()); } else PassTy = llvm::IntegerType::get(getVMContext(), Size); return ABIArgInfo::getDirect(PassTy); } // Non-structure compounds are passed indirectly. if (isCompoundType(Ty)) return ABIArgInfo::getIndirect(0, /*ByVal=*/false); return ABIArgInfo::getDirect(nullptr); } //===----------------------------------------------------------------------===// // MSP430 ABI Implementation //===----------------------------------------------------------------------===// namespace { class MSP430TargetCodeGenInfo : public TargetCodeGenInfo { public: MSP430TargetCodeGenInfo(CodeGenTypes &CGT) : TargetCodeGenInfo(new DefaultABIInfo(CGT)) {} void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const override; }; } void MSP430TargetCodeGenInfo::SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const { if (const FunctionDecl *FD = dyn_cast(D)) { if (const MSP430InterruptAttr *attr = FD->getAttr()) { // Handle 'interrupt' attribute: llvm::Function *F = cast(GV); // Step 1: Set ISR calling convention. F->setCallingConv(llvm::CallingConv::MSP430_INTR); // Step 2: Add attributes goodness. F->addFnAttr(llvm::Attribute::NoInline); // Step 3: Emit ISR vector alias. unsigned Num = attr->getNumber() / 2; llvm::GlobalAlias::create(llvm::Function::ExternalLinkage, "__isr_" + Twine(Num), F); } } } //===----------------------------------------------------------------------===// // MIPS ABI Implementation. This works for both little-endian and // big-endian variants. //===----------------------------------------------------------------------===// namespace { class MipsABIInfo : public ABIInfo { bool IsO32; unsigned MinABIStackAlignInBytes, StackAlignInBytes; void CoerceToIntArgs(uint64_t TySize, SmallVectorImpl &ArgList) const; llvm::Type* HandleAggregates(QualType Ty, uint64_t TySize) const; llvm::Type* returnAggregateInRegs(QualType RetTy, uint64_t Size) const; llvm::Type* getPaddingType(uint64_t Align, uint64_t Offset) const; public: MipsABIInfo(CodeGenTypes &CGT, bool _IsO32) : ABIInfo(CGT), IsO32(_IsO32), MinABIStackAlignInBytes(IsO32 ? 4 : 8), StackAlignInBytes(IsO32 ? 8 : 16) {} ABIArgInfo classifyReturnType(QualType RetTy) const; ABIArgInfo classifyArgumentType(QualType RetTy, uint64_t &Offset) const; void computeInfo(CGFunctionInfo &FI) const override; llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; }; class MIPSTargetCodeGenInfo : public TargetCodeGenInfo { unsigned SizeOfUnwindException; public: MIPSTargetCodeGenInfo(CodeGenTypes &CGT, bool IsO32) : TargetCodeGenInfo(new MipsABIInfo(CGT, IsO32)), SizeOfUnwindException(IsO32 ? 24 : 32) {} int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const override { return 29; } void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &CGM) const override { const FunctionDecl *FD = dyn_cast(D); if (!FD) return; llvm::Function *Fn = cast(GV); if (FD->hasAttr()) { Fn->addFnAttr("mips16"); } else if (FD->hasAttr()) { Fn->addFnAttr("nomips16"); } } bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const override; unsigned getSizeOfUnwindException() const override { return SizeOfUnwindException; } }; } void MipsABIInfo::CoerceToIntArgs(uint64_t TySize, SmallVectorImpl &ArgList) const { llvm::IntegerType *IntTy = llvm::IntegerType::get(getVMContext(), MinABIStackAlignInBytes * 8); // Add (TySize / MinABIStackAlignInBytes) args of IntTy. for (unsigned N = TySize / (MinABIStackAlignInBytes * 8); N; --N) ArgList.push_back(IntTy); // If necessary, add one more integer type to ArgList. unsigned R = TySize % (MinABIStackAlignInBytes * 8); if (R) ArgList.push_back(llvm::IntegerType::get(getVMContext(), R)); } // In N32/64, an aligned double precision floating point field is passed in // a register. llvm::Type* MipsABIInfo::HandleAggregates(QualType Ty, uint64_t TySize) const { SmallVector ArgList, IntArgList; if (IsO32) { CoerceToIntArgs(TySize, ArgList); return llvm::StructType::get(getVMContext(), ArgList); } if (Ty->isComplexType()) return CGT.ConvertType(Ty); const RecordType *RT = Ty->getAs(); // Unions/vectors are passed in integer registers. if (!RT || !RT->isStructureOrClassType()) { CoerceToIntArgs(TySize, ArgList); return llvm::StructType::get(getVMContext(), ArgList); } const RecordDecl *RD = RT->getDecl(); const ASTRecordLayout &Layout = getContext().getASTRecordLayout(RD); assert(!(TySize % 8) && "Size of structure must be multiple of 8."); uint64_t LastOffset = 0; unsigned idx = 0; llvm::IntegerType *I64 = llvm::IntegerType::get(getVMContext(), 64); // Iterate over fields in the struct/class and check if there are any aligned // double fields. for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end(); i != e; ++i, ++idx) { const QualType Ty = i->getType(); const BuiltinType *BT = Ty->getAs(); if (!BT || BT->getKind() != BuiltinType::Double) continue; uint64_t Offset = Layout.getFieldOffset(idx); if (Offset % 64) // Ignore doubles that are not aligned. continue; // Add ((Offset - LastOffset) / 64) args of type i64. for (unsigned j = (Offset - LastOffset) / 64; j > 0; --j) ArgList.push_back(I64); // Add double type. ArgList.push_back(llvm::Type::getDoubleTy(getVMContext())); LastOffset = Offset + 64; } CoerceToIntArgs(TySize - LastOffset, IntArgList); ArgList.append(IntArgList.begin(), IntArgList.end()); return llvm::StructType::get(getVMContext(), ArgList); } llvm::Type *MipsABIInfo::getPaddingType(uint64_t OrigOffset, uint64_t Offset) const { if (OrigOffset + MinABIStackAlignInBytes > Offset) return nullptr; return llvm::IntegerType::get(getVMContext(), (Offset - OrigOffset) * 8); } ABIArgInfo MipsABIInfo::classifyArgumentType(QualType Ty, uint64_t &Offset) const { Ty = useFirstFieldIfTransparentUnion(Ty); uint64_t OrigOffset = Offset; uint64_t TySize = getContext().getTypeSize(Ty); uint64_t Align = getContext().getTypeAlign(Ty) / 8; Align = std::min(std::max(Align, (uint64_t)MinABIStackAlignInBytes), (uint64_t)StackAlignInBytes); unsigned CurrOffset = llvm::RoundUpToAlignment(Offset, Align); Offset = CurrOffset + llvm::RoundUpToAlignment(TySize, Align * 8) / 8; if (isAggregateTypeForABI(Ty) || Ty->isVectorType()) { // Ignore empty aggregates. if (TySize == 0) return ABIArgInfo::getIgnore(); if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) { Offset = OrigOffset + MinABIStackAlignInBytes; return ABIArgInfo::getIndirect(0, RAA == CGCXXABI::RAA_DirectInMemory); } // If we have reached here, aggregates are passed directly by coercing to // another structure type. Padding is inserted if the offset of the // aggregate is unaligned. ABIArgInfo ArgInfo = ABIArgInfo::getDirect(HandleAggregates(Ty, TySize), 0, getPaddingType(OrigOffset, CurrOffset)); ArgInfo.setInReg(true); return ArgInfo; } // Treat an enum type as its underlying type. if (const EnumType *EnumTy = Ty->getAs()) Ty = EnumTy->getDecl()->getIntegerType(); // All integral types are promoted to the GPR width. if (Ty->isIntegralOrEnumerationType()) return ABIArgInfo::getExtend(); return ABIArgInfo::getDirect( nullptr, 0, IsO32 ? nullptr : getPaddingType(OrigOffset, CurrOffset)); } llvm::Type* MipsABIInfo::returnAggregateInRegs(QualType RetTy, uint64_t Size) const { const RecordType *RT = RetTy->getAs(); SmallVector RTList; if (RT && RT->isStructureOrClassType()) { const RecordDecl *RD = RT->getDecl(); const ASTRecordLayout &Layout = getContext().getASTRecordLayout(RD); unsigned FieldCnt = Layout.getFieldCount(); // N32/64 returns struct/classes in floating point registers if the // following conditions are met: // 1. The size of the struct/class is no larger than 128-bit. // 2. The struct/class has one or two fields all of which are floating // point types. // 3. The offset of the first field is zero (this follows what gcc does). // // Any other composite results are returned in integer registers. // if (FieldCnt && (FieldCnt <= 2) && !Layout.getFieldOffset(0)) { RecordDecl::field_iterator b = RD->field_begin(), e = RD->field_end(); for (; b != e; ++b) { const BuiltinType *BT = b->getType()->getAs(); if (!BT || !BT->isFloatingPoint()) break; RTList.push_back(CGT.ConvertType(b->getType())); } if (b == e) return llvm::StructType::get(getVMContext(), RTList, RD->hasAttr()); RTList.clear(); } } CoerceToIntArgs(Size, RTList); return llvm::StructType::get(getVMContext(), RTList); } ABIArgInfo MipsABIInfo::classifyReturnType(QualType RetTy) const { uint64_t Size = getContext().getTypeSize(RetTy); if (RetTy->isVoidType()) return ABIArgInfo::getIgnore(); // O32 doesn't treat zero-sized structs differently from other structs. // However, N32/N64 ignores zero sized return values. if (!IsO32 && Size == 0) return ABIArgInfo::getIgnore(); if (isAggregateTypeForABI(RetTy) || RetTy->isVectorType()) { if (Size <= 128) { if (RetTy->isAnyComplexType()) return ABIArgInfo::getDirect(); // O32 returns integer vectors in registers and N32/N64 returns all small // aggregates in registers. if (!IsO32 || (RetTy->isVectorType() && !RetTy->hasFloatingRepresentation())) { ABIArgInfo ArgInfo = ABIArgInfo::getDirect(returnAggregateInRegs(RetTy, Size)); ArgInfo.setInReg(true); return ArgInfo; } } return ABIArgInfo::getIndirect(0); } // Treat an enum type as its underlying type. if (const EnumType *EnumTy = RetTy->getAs()) RetTy = EnumTy->getDecl()->getIntegerType(); return (RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } void MipsABIInfo::computeInfo(CGFunctionInfo &FI) const { ABIArgInfo &RetInfo = FI.getReturnInfo(); if (!getCXXABI().classifyReturnType(FI)) RetInfo = classifyReturnType(FI.getReturnType()); // Check if a pointer to an aggregate is passed as a hidden argument. uint64_t Offset = RetInfo.isIndirect() ? MinABIStackAlignInBytes : 0; for (auto &I : FI.arguments()) I.info = classifyArgumentType(I.type, Offset); } llvm::Value* MipsABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { llvm::Type *BP = CGF.Int8PtrTy; llvm::Type *BPP = CGF.Int8PtrPtrTy; // Integer arguments are promoted to 32-bit on O32 and 64-bit on N32/N64. // Pointers are also promoted in the same way but this only matters for N32. unsigned SlotSizeInBits = IsO32 ? 32 : 64; unsigned PtrWidth = getTarget().getPointerWidth(0); if ((Ty->isIntegerType() && CGF.getContext().getIntWidth(Ty) < SlotSizeInBits) || (Ty->isPointerType() && PtrWidth < SlotSizeInBits)) { Ty = CGF.getContext().getIntTypeForBitwidth(SlotSizeInBits, Ty->isSignedIntegerType()); } CGBuilderTy &Builder = CGF.Builder; llvm::Value *VAListAddrAsBPP = Builder.CreateBitCast(VAListAddr, BPP, "ap"); llvm::Value *Addr = Builder.CreateLoad(VAListAddrAsBPP, "ap.cur"); int64_t TypeAlign = std::min(getContext().getTypeAlign(Ty) / 8, StackAlignInBytes); llvm::Type *PTy = llvm::PointerType::getUnqual(CGF.ConvertType(Ty)); llvm::Value *AddrTyped; llvm::IntegerType *IntTy = (PtrWidth == 32) ? CGF.Int32Ty : CGF.Int64Ty; if (TypeAlign > MinABIStackAlignInBytes) { llvm::Value *AddrAsInt = CGF.Builder.CreatePtrToInt(Addr, IntTy); llvm::Value *Inc = llvm::ConstantInt::get(IntTy, TypeAlign - 1); llvm::Value *Mask = llvm::ConstantInt::get(IntTy, -TypeAlign); llvm::Value *Add = CGF.Builder.CreateAdd(AddrAsInt, Inc); llvm::Value *And = CGF.Builder.CreateAnd(Add, Mask); AddrTyped = CGF.Builder.CreateIntToPtr(And, PTy); } else AddrTyped = Builder.CreateBitCast(Addr, PTy); llvm::Value *AlignedAddr = Builder.CreateBitCast(AddrTyped, BP); TypeAlign = std::max((unsigned)TypeAlign, MinABIStackAlignInBytes); unsigned ArgSizeInBits = CGF.getContext().getTypeSize(Ty); uint64_t Offset = llvm::RoundUpToAlignment(ArgSizeInBits / 8, TypeAlign); llvm::Value *NextAddr = Builder.CreateGEP(AlignedAddr, llvm::ConstantInt::get(IntTy, Offset), "ap.next"); Builder.CreateStore(NextAddr, VAListAddrAsBPP); return AddrTyped; } bool MIPSTargetCodeGenInfo::initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const { // This information comes from gcc's implementation, which seems to // as canonical as it gets. // Everything on MIPS is 4 bytes. Double-precision FP registers // are aliased to pairs of single-precision FP registers. llvm::Value *Four8 = llvm::ConstantInt::get(CGF.Int8Ty, 4); // 0-31 are the general purpose registers, $0 - $31. // 32-63 are the floating-point registers, $f0 - $f31. // 64 and 65 are the multiply/divide registers, $hi and $lo. // 66 is the (notional, I think) register for signal-handler return. AssignToArrayRange(CGF.Builder, Address, Four8, 0, 65); // 67-74 are the floating-point status registers, $fcc0 - $fcc7. // They are one bit wide and ignored here. // 80-111 are the coprocessor 0 registers, $c0r0 - $c0r31. // (coprocessor 1 is the FP unit) // 112-143 are the coprocessor 2 registers, $c2r0 - $c2r31. // 144-175 are the coprocessor 3 registers, $c3r0 - $c3r31. // 176-181 are the DSP accumulator registers. AssignToArrayRange(CGF.Builder, Address, Four8, 80, 181); return false; } //===----------------------------------------------------------------------===// // TCE ABI Implementation (see http://tce.cs.tut.fi). Uses mostly the defaults. // Currently subclassed only to implement custom OpenCL C function attribute // handling. //===----------------------------------------------------------------------===// namespace { class TCETargetCodeGenInfo : public DefaultTargetCodeGenInfo { public: TCETargetCodeGenInfo(CodeGenTypes &CGT) : DefaultTargetCodeGenInfo(CGT) {} void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const override; }; void TCETargetCodeGenInfo::SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const { const FunctionDecl *FD = dyn_cast(D); if (!FD) return; llvm::Function *F = cast(GV); if (M.getLangOpts().OpenCL) { if (FD->hasAttr()) { // OpenCL C Kernel functions are not subject to inlining F->addFnAttr(llvm::Attribute::NoInline); const ReqdWorkGroupSizeAttr *Attr = FD->getAttr(); if (Attr) { // Convert the reqd_work_group_size() attributes to metadata. llvm::LLVMContext &Context = F->getContext(); llvm::NamedMDNode *OpenCLMetadata = M.getModule().getOrInsertNamedMetadata("opencl.kernel_wg_size_info"); SmallVector Operands; Operands.push_back(llvm::ConstantAsMetadata::get(F)); Operands.push_back( llvm::ConstantAsMetadata::get(llvm::Constant::getIntegerValue( M.Int32Ty, llvm::APInt(32, Attr->getXDim())))); Operands.push_back( llvm::ConstantAsMetadata::get(llvm::Constant::getIntegerValue( M.Int32Ty, llvm::APInt(32, Attr->getYDim())))); Operands.push_back( llvm::ConstantAsMetadata::get(llvm::Constant::getIntegerValue( M.Int32Ty, llvm::APInt(32, Attr->getZDim())))); // Add a boolean constant operand for "required" (true) or "hint" (false) // for implementing the work_group_size_hint attr later. Currently // always true as the hint is not yet implemented. Operands.push_back( llvm::ConstantAsMetadata::get(llvm::ConstantInt::getTrue(Context))); OpenCLMetadata->addOperand(llvm::MDNode::get(Context, Operands)); } } } } } //===----------------------------------------------------------------------===// // Hexagon ABI Implementation //===----------------------------------------------------------------------===// namespace { class HexagonABIInfo : public ABIInfo { public: HexagonABIInfo(CodeGenTypes &CGT) : ABIInfo(CGT) {} private: ABIArgInfo classifyReturnType(QualType RetTy) const; ABIArgInfo classifyArgumentType(QualType RetTy) const; void computeInfo(CGFunctionInfo &FI) const override; llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; }; class HexagonTargetCodeGenInfo : public TargetCodeGenInfo { public: HexagonTargetCodeGenInfo(CodeGenTypes &CGT) :TargetCodeGenInfo(new HexagonABIInfo(CGT)) {} int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const override { return 29; } }; } void HexagonABIInfo::computeInfo(CGFunctionInfo &FI) const { if (!getCXXABI().classifyReturnType(FI)) FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); for (auto &I : FI.arguments()) I.info = classifyArgumentType(I.type); } ABIArgInfo HexagonABIInfo::classifyArgumentType(QualType Ty) const { if (!isAggregateTypeForABI(Ty)) { // Treat an enum type as its underlying type. if (const EnumType *EnumTy = Ty->getAs()) Ty = EnumTy->getDecl()->getIntegerType(); return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } // Ignore empty records. if (isEmptyRecord(getContext(), Ty, true)) return ABIArgInfo::getIgnore(); if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) return ABIArgInfo::getIndirect(0, RAA == CGCXXABI::RAA_DirectInMemory); uint64_t Size = getContext().getTypeSize(Ty); if (Size > 64) return ABIArgInfo::getIndirect(0, /*ByVal=*/true); // Pass in the smallest viable integer type. else if (Size > 32) return ABIArgInfo::getDirect(llvm::Type::getInt64Ty(getVMContext())); else if (Size > 16) return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext())); else if (Size > 8) return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext())); else return ABIArgInfo::getDirect(llvm::Type::getInt8Ty(getVMContext())); } ABIArgInfo HexagonABIInfo::classifyReturnType(QualType RetTy) const { if (RetTy->isVoidType()) return ABIArgInfo::getIgnore(); // Large vector types should be returned via memory. if (RetTy->isVectorType() && getContext().getTypeSize(RetTy) > 64) return ABIArgInfo::getIndirect(0); if (!isAggregateTypeForABI(RetTy)) { // Treat an enum type as its underlying type. if (const EnumType *EnumTy = RetTy->getAs()) RetTy = EnumTy->getDecl()->getIntegerType(); return (RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend() : ABIArgInfo::getDirect()); } if (isEmptyRecord(getContext(), RetTy, true)) return ABIArgInfo::getIgnore(); // Aggregates <= 8 bytes are returned in r0; other aggregates // are returned indirectly. uint64_t Size = getContext().getTypeSize(RetTy); if (Size <= 64) { // Return in the smallest viable integer type. if (Size <= 8) return ABIArgInfo::getDirect(llvm::Type::getInt8Ty(getVMContext())); if (Size <= 16) return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext())); if (Size <= 32) return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext())); return ABIArgInfo::getDirect(llvm::Type::getInt64Ty(getVMContext())); } return ABIArgInfo::getIndirect(0, /*ByVal=*/true); } llvm::Value *HexagonABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { // FIXME: Need to handle alignment llvm::Type *BPP = CGF.Int8PtrPtrTy; CGBuilderTy &Builder = CGF.Builder; llvm::Value *VAListAddrAsBPP = Builder.CreateBitCast(VAListAddr, BPP, "ap"); llvm::Value *Addr = Builder.CreateLoad(VAListAddrAsBPP, "ap.cur"); llvm::Type *PTy = llvm::PointerType::getUnqual(CGF.ConvertType(Ty)); llvm::Value *AddrTyped = Builder.CreateBitCast(Addr, PTy); uint64_t Offset = llvm::RoundUpToAlignment(CGF.getContext().getTypeSize(Ty) / 8, 4); llvm::Value *NextAddr = Builder.CreateGEP(Addr, llvm::ConstantInt::get(CGF.Int32Ty, Offset), "ap.next"); Builder.CreateStore(NextAddr, VAListAddrAsBPP); return AddrTyped; } //===----------------------------------------------------------------------===// // AMDGPU ABI Implementation //===----------------------------------------------------------------------===// namespace { class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo { public: AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT) : TargetCodeGenInfo(new DefaultABIInfo(CGT)) {} void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const override; }; } void AMDGPUTargetCodeGenInfo::SetTargetAttributes( const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const { const FunctionDecl *FD = dyn_cast(D); if (!FD) return; if (const auto Attr = FD->getAttr()) { llvm::Function *F = cast(GV); uint32_t NumVGPR = Attr->getNumVGPR(); if (NumVGPR != 0) F->addFnAttr("amdgpu_num_vgpr", llvm::utostr(NumVGPR)); } if (const auto Attr = FD->getAttr()) { llvm::Function *F = cast(GV); unsigned NumSGPR = Attr->getNumSGPR(); if (NumSGPR != 0) F->addFnAttr("amdgpu_num_sgpr", llvm::utostr(NumSGPR)); } } //===----------------------------------------------------------------------===// // SPARC v9 ABI Implementation. // Based on the SPARC Compliance Definition version 2.4.1. // // Function arguments a mapped to a nominal "parameter array" and promoted to // registers depending on their type. Each argument occupies 8 or 16 bytes in // the array, structs larger than 16 bytes are passed indirectly. // // One case requires special care: // // struct mixed { // int i; // float f; // }; // // When a struct mixed is passed by value, it only occupies 8 bytes in the // parameter array, but the int is passed in an integer register, and the float // is passed in a floating point register. This is represented as two arguments // with the LLVM IR inreg attribute: // // declare void f(i32 inreg %i, float inreg %f) // // The code generator will only allocate 4 bytes from the parameter array for // the inreg arguments. All other arguments are allocated a multiple of 8 // bytes. // namespace { class SparcV9ABIInfo : public ABIInfo { public: SparcV9ABIInfo(CodeGenTypes &CGT) : ABIInfo(CGT) {} private: ABIArgInfo classifyType(QualType RetTy, unsigned SizeLimit) const; void computeInfo(CGFunctionInfo &FI) const override; llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; // Coercion type builder for structs passed in registers. The coercion type // serves two purposes: // // 1. Pad structs to a multiple of 64 bits, so they are passed 'left-aligned' // in registers. // 2. Expose aligned floating point elements as first-level elements, so the // code generator knows to pass them in floating point registers. // // We also compute the InReg flag which indicates that the struct contains // aligned 32-bit floats. // struct CoerceBuilder { llvm::LLVMContext &Context; const llvm::DataLayout &DL; SmallVector Elems; uint64_t Size; bool InReg; CoerceBuilder(llvm::LLVMContext &c, const llvm::DataLayout &dl) : Context(c), DL(dl), Size(0), InReg(false) {} // Pad Elems with integers until Size is ToSize. void pad(uint64_t ToSize) { assert(ToSize >= Size && "Cannot remove elements"); if (ToSize == Size) return; // Finish the current 64-bit word. uint64_t Aligned = llvm::RoundUpToAlignment(Size, 64); if (Aligned > Size && Aligned <= ToSize) { Elems.push_back(llvm::IntegerType::get(Context, Aligned - Size)); Size = Aligned; } // Add whole 64-bit words. while (Size + 64 <= ToSize) { Elems.push_back(llvm::Type::getInt64Ty(Context)); Size += 64; } // Final in-word padding. if (Size < ToSize) { Elems.push_back(llvm::IntegerType::get(Context, ToSize - Size)); Size = ToSize; } } // Add a floating point element at Offset. void addFloat(uint64_t Offset, llvm::Type *Ty, unsigned Bits) { // Unaligned floats are treated as integers. if (Offset % Bits) return; // The InReg flag is only required if there are any floats < 64 bits. if (Bits < 64) InReg = true; pad(Offset); Elems.push_back(Ty); Size = Offset + Bits; } // Add a struct type to the coercion type, starting at Offset (in bits). void addStruct(uint64_t Offset, llvm::StructType *StrTy) { const llvm::StructLayout *Layout = DL.getStructLayout(StrTy); for (unsigned i = 0, e = StrTy->getNumElements(); i != e; ++i) { llvm::Type *ElemTy = StrTy->getElementType(i); uint64_t ElemOffset = Offset + Layout->getElementOffsetInBits(i); switch (ElemTy->getTypeID()) { case llvm::Type::StructTyID: addStruct(ElemOffset, cast(ElemTy)); break; case llvm::Type::FloatTyID: addFloat(ElemOffset, ElemTy, 32); break; case llvm::Type::DoubleTyID: addFloat(ElemOffset, ElemTy, 64); break; case llvm::Type::FP128TyID: addFloat(ElemOffset, ElemTy, 128); break; case llvm::Type::PointerTyID: if (ElemOffset % 64 == 0) { pad(ElemOffset); Elems.push_back(ElemTy); Size += 64; } break; default: break; } } } // Check if Ty is a usable substitute for the coercion type. bool isUsableType(llvm::StructType *Ty) const { if (Ty->getNumElements() != Elems.size()) return false; for (unsigned i = 0, e = Elems.size(); i != e; ++i) if (Elems[i] != Ty->getElementType(i)) return false; return true; } // Get the coercion type as a literal struct type. llvm::Type *getType() const { if (Elems.size() == 1) return Elems.front(); else return llvm::StructType::get(Context, Elems); } }; }; } // end anonymous namespace ABIArgInfo SparcV9ABIInfo::classifyType(QualType Ty, unsigned SizeLimit) const { if (Ty->isVoidType()) return ABIArgInfo::getIgnore(); uint64_t Size = getContext().getTypeSize(Ty); // Anything too big to fit in registers is passed with an explicit indirect // pointer / sret pointer. if (Size > SizeLimit) return ABIArgInfo::getIndirect(0, /*ByVal=*/false); // Treat an enum type as its underlying type. if (const EnumType *EnumTy = Ty->getAs()) Ty = EnumTy->getDecl()->getIntegerType(); // Integer types smaller than a register are extended. if (Size < 64 && Ty->isIntegerType()) return ABIArgInfo::getExtend(); // Other non-aggregates go in registers. if (!isAggregateTypeForABI(Ty)) return ABIArgInfo::getDirect(); // If a C++ object has either a non-trivial copy constructor or a non-trivial // destructor, it is passed with an explicit indirect pointer / sret pointer. if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) return ABIArgInfo::getIndirect(0, RAA == CGCXXABI::RAA_DirectInMemory); // This is a small aggregate type that should be passed in registers. // Build a coercion type from the LLVM struct type. llvm::StructType *StrTy = dyn_cast(CGT.ConvertType(Ty)); if (!StrTy) return ABIArgInfo::getDirect(); CoerceBuilder CB(getVMContext(), getDataLayout()); CB.addStruct(0, StrTy); CB.pad(llvm::RoundUpToAlignment(CB.DL.getTypeSizeInBits(StrTy), 64)); // Try to use the original type for coercion. llvm::Type *CoerceTy = CB.isUsableType(StrTy) ? StrTy : CB.getType(); if (CB.InReg) return ABIArgInfo::getDirectInReg(CoerceTy); else return ABIArgInfo::getDirect(CoerceTy); } llvm::Value *SparcV9ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { ABIArgInfo AI = classifyType(Ty, 16 * 8); llvm::Type *ArgTy = CGT.ConvertType(Ty); if (AI.canHaveCoerceToType() && !AI.getCoerceToType()) AI.setCoerceToType(ArgTy); llvm::Type *BPP = CGF.Int8PtrPtrTy; CGBuilderTy &Builder = CGF.Builder; llvm::Value *VAListAddrAsBPP = Builder.CreateBitCast(VAListAddr, BPP, "ap"); llvm::Value *Addr = Builder.CreateLoad(VAListAddrAsBPP, "ap.cur"); llvm::Type *ArgPtrTy = llvm::PointerType::getUnqual(ArgTy); llvm::Value *ArgAddr; unsigned Stride; switch (AI.getKind()) { case ABIArgInfo::Expand: case ABIArgInfo::InAlloca: llvm_unreachable("Unsupported ABI kind for va_arg"); case ABIArgInfo::Extend: Stride = 8; ArgAddr = Builder .CreateConstGEP1_32(Addr, 8 - getDataLayout().getTypeAllocSize(ArgTy), "extend"); break; case ABIArgInfo::Direct: Stride = getDataLayout().getTypeAllocSize(AI.getCoerceToType()); ArgAddr = Addr; break; case ABIArgInfo::Indirect: Stride = 8; ArgAddr = Builder.CreateBitCast(Addr, llvm::PointerType::getUnqual(ArgPtrTy), "indirect"); ArgAddr = Builder.CreateLoad(ArgAddr, "indirect.arg"); break; case ABIArgInfo::Ignore: return llvm::UndefValue::get(ArgPtrTy); } // Update VAList. Addr = Builder.CreateConstGEP1_32(Addr, Stride, "ap.next"); Builder.CreateStore(Addr, VAListAddrAsBPP); return Builder.CreatePointerCast(ArgAddr, ArgPtrTy, "arg.addr"); } void SparcV9ABIInfo::computeInfo(CGFunctionInfo &FI) const { FI.getReturnInfo() = classifyType(FI.getReturnType(), 32 * 8); for (auto &I : FI.arguments()) I.info = classifyType(I.type, 16 * 8); } namespace { class SparcV9TargetCodeGenInfo : public TargetCodeGenInfo { public: SparcV9TargetCodeGenInfo(CodeGenTypes &CGT) : TargetCodeGenInfo(new SparcV9ABIInfo(CGT)) {} int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const override { return 14; } bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const override; }; } // end anonymous namespace bool SparcV9TargetCodeGenInfo::initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF, llvm::Value *Address) const { // This is calculated from the LLVM and GCC tables and verified // against gcc output. AFAIK all ABIs use the same encoding. CodeGen::CGBuilderTy &Builder = CGF.Builder; llvm::IntegerType *i8 = CGF.Int8Ty; llvm::Value *Four8 = llvm::ConstantInt::get(i8, 4); llvm::Value *Eight8 = llvm::ConstantInt::get(i8, 8); // 0-31: the 8-byte general-purpose registers AssignToArrayRange(Builder, Address, Eight8, 0, 31); // 32-63: f0-31, the 4-byte floating-point registers AssignToArrayRange(Builder, Address, Four8, 32, 63); // Y = 64 // PSR = 65 // WIM = 66 // TBR = 67 // PC = 68 // NPC = 69 // FSR = 70 // CSR = 71 AssignToArrayRange(Builder, Address, Eight8, 64, 71); // 72-87: d0-15, the 8-byte floating-point registers AssignToArrayRange(Builder, Address, Eight8, 72, 87); return false; } //===----------------------------------------------------------------------===// // XCore ABI Implementation //===----------------------------------------------------------------------===// namespace { /// A SmallStringEnc instance is used to build up the TypeString by passing /// it by reference between functions that append to it. typedef llvm::SmallString<128> SmallStringEnc; /// TypeStringCache caches the meta encodings of Types. /// /// The reason for caching TypeStrings is two fold: /// 1. To cache a type's encoding for later uses; /// 2. As a means to break recursive member type inclusion. /// /// A cache Entry can have a Status of: /// NonRecursive: The type encoding is not recursive; /// Recursive: The type encoding is recursive; /// Incomplete: An incomplete TypeString; /// IncompleteUsed: An incomplete TypeString that has been used in a /// Recursive type encoding. /// /// A NonRecursive entry will have all of its sub-members expanded as fully /// as possible. Whilst it may contain types which are recursive, the type /// itself is not recursive and thus its encoding may be safely used whenever /// the type is encountered. /// /// A Recursive entry will have all of its sub-members expanded as fully as /// possible. The type itself is recursive and it may contain other types which /// are recursive. The Recursive encoding must not be used during the expansion /// of a recursive type's recursive branch. For simplicity the code uses /// IncompleteCount to reject all usage of Recursive encodings for member types. /// /// An Incomplete entry is always a RecordType and only encodes its /// identifier e.g. "s(S){}". Incomplete 'StubEnc' entries are ephemeral and /// are placed into the cache during type expansion as a means to identify and /// handle recursive inclusion of types as sub-members. If there is recursion /// the entry becomes IncompleteUsed. /// /// During the expansion of a RecordType's members: /// /// If the cache contains a NonRecursive encoding for the member type, the /// cached encoding is used; /// /// If the cache contains a Recursive encoding for the member type, the /// cached encoding is 'Swapped' out, as it may be incorrect, and... /// /// If the member is a RecordType, an Incomplete encoding is placed into the /// cache to break potential recursive inclusion of itself as a sub-member; /// /// Once a member RecordType has been expanded, its temporary incomplete /// entry is removed from the cache. If a Recursive encoding was swapped out /// it is swapped back in; /// /// If an incomplete entry is used to expand a sub-member, the incomplete /// entry is marked as IncompleteUsed. The cache keeps count of how many /// IncompleteUsed entries it currently contains in IncompleteUsedCount; /// /// If a member's encoding is found to be a NonRecursive or Recursive viz: /// IncompleteUsedCount==0, the member's encoding is added to the cache. /// Else the member is part of a recursive type and thus the recursion has /// been exited too soon for the encoding to be correct for the member. /// class TypeStringCache { enum Status {NonRecursive, Recursive, Incomplete, IncompleteUsed}; struct Entry { std::string Str; // The encoded TypeString for the type. enum Status State; // Information about the encoding in 'Str'. std::string Swapped; // A temporary place holder for a Recursive encoding // during the expansion of RecordType's members. }; std::map Map; unsigned IncompleteCount; // Number of Incomplete entries in the Map. unsigned IncompleteUsedCount; // Number of IncompleteUsed entries in the Map. public: TypeStringCache() : IncompleteCount(0), IncompleteUsedCount(0) {}; void addIncomplete(const IdentifierInfo *ID, std::string StubEnc); bool removeIncomplete(const IdentifierInfo *ID); void addIfComplete(const IdentifierInfo *ID, StringRef Str, bool IsRecursive); StringRef lookupStr(const IdentifierInfo *ID); }; /// TypeString encodings for enum & union fields must be order. /// FieldEncoding is a helper for this ordering process. class FieldEncoding { bool HasName; std::string Enc; public: FieldEncoding(bool b, SmallStringEnc &e) : HasName(b), Enc(e.c_str()) {}; StringRef str() {return Enc.c_str();}; bool operator<(const FieldEncoding &rhs) const { if (HasName != rhs.HasName) return HasName; return Enc < rhs.Enc; } }; class XCoreABIInfo : public DefaultABIInfo { public: XCoreABIInfo(CodeGen::CodeGenTypes &CGT) : DefaultABIInfo(CGT) {} llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const override; }; class XCoreTargetCodeGenInfo : public TargetCodeGenInfo { mutable TypeStringCache TSC; public: XCoreTargetCodeGenInfo(CodeGenTypes &CGT) :TargetCodeGenInfo(new XCoreABIInfo(CGT)) {} void emitTargetMD(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const override; }; } // End anonymous namespace. llvm::Value *XCoreABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, CodeGenFunction &CGF) const { CGBuilderTy &Builder = CGF.Builder; // Get the VAList. llvm::Value *VAListAddrAsBPP = Builder.CreateBitCast(VAListAddr, CGF.Int8PtrPtrTy); llvm::Value *AP = Builder.CreateLoad(VAListAddrAsBPP); // Handle the argument. ABIArgInfo AI = classifyArgumentType(Ty); llvm::Type *ArgTy = CGT.ConvertType(Ty); if (AI.canHaveCoerceToType() && !AI.getCoerceToType()) AI.setCoerceToType(ArgTy); llvm::Type *ArgPtrTy = llvm::PointerType::getUnqual(ArgTy); llvm::Value *Val; uint64_t ArgSize = 0; switch (AI.getKind()) { case ABIArgInfo::Expand: case ABIArgInfo::InAlloca: llvm_unreachable("Unsupported ABI kind for va_arg"); case ABIArgInfo::Ignore: Val = llvm::UndefValue::get(ArgPtrTy); ArgSize = 0; break; case ABIArgInfo::Extend: case ABIArgInfo::Direct: Val = Builder.CreatePointerCast(AP, ArgPtrTy); ArgSize = getDataLayout().getTypeAllocSize(AI.getCoerceToType()); if (ArgSize < 4) ArgSize = 4; break; case ABIArgInfo::Indirect: llvm::Value *ArgAddr; ArgAddr = Builder.CreateBitCast(AP, llvm::PointerType::getUnqual(ArgPtrTy)); ArgAddr = Builder.CreateLoad(ArgAddr); Val = Builder.CreatePointerCast(ArgAddr, ArgPtrTy); ArgSize = 4; break; } // Increment the VAList. if (ArgSize) { llvm::Value *APN = Builder.CreateConstGEP1_32(AP, ArgSize); Builder.CreateStore(APN, VAListAddrAsBPP); } return Val; } /// During the expansion of a RecordType, an incomplete TypeString is placed /// into the cache as a means to identify and break recursion. /// If there is a Recursive encoding in the cache, it is swapped out and will /// be reinserted by removeIncomplete(). /// All other types of encoding should have been used rather than arriving here. void TypeStringCache::addIncomplete(const IdentifierInfo *ID, std::string StubEnc) { if (!ID) return; Entry &E = Map[ID]; assert( (E.Str.empty() || E.State == Recursive) && "Incorrectly use of addIncomplete"); assert(!StubEnc.empty() && "Passing an empty string to addIncomplete()"); E.Swapped.swap(E.Str); // swap out the Recursive E.Str.swap(StubEnc); E.State = Incomplete; ++IncompleteCount; } /// Once the RecordType has been expanded, the temporary incomplete TypeString /// must be removed from the cache. /// If a Recursive was swapped out by addIncomplete(), it will be replaced. /// Returns true if the RecordType was defined recursively. bool TypeStringCache::removeIncomplete(const IdentifierInfo *ID) { if (!ID) return false; auto I = Map.find(ID); assert(I != Map.end() && "Entry not present"); Entry &E = I->second; assert( (E.State == Incomplete || E.State == IncompleteUsed) && "Entry must be an incomplete type"); bool IsRecursive = false; if (E.State == IncompleteUsed) { // We made use of our Incomplete encoding, thus we are recursive. IsRecursive = true; --IncompleteUsedCount; } if (E.Swapped.empty()) Map.erase(I); else { // Swap the Recursive back. E.Swapped.swap(E.Str); E.Swapped.clear(); E.State = Recursive; } --IncompleteCount; return IsRecursive; } /// Add the encoded TypeString to the cache only if it is NonRecursive or /// Recursive (viz: all sub-members were expanded as fully as possible). void TypeStringCache::addIfComplete(const IdentifierInfo *ID, StringRef Str, bool IsRecursive) { if (!ID || IncompleteUsedCount) return; // No key or it is is an incomplete sub-type so don't add. Entry &E = Map[ID]; if (IsRecursive && !E.Str.empty()) { assert(E.State==Recursive && E.Str.size() == Str.size() && "This is not the same Recursive entry"); // The parent container was not recursive after all, so we could have used // this Recursive sub-member entry after all, but we assumed the worse when // we started viz: IncompleteCount!=0. return; } assert(E.Str.empty() && "Entry already present"); E.Str = Str.str(); E.State = IsRecursive? Recursive : NonRecursive; } /// Return a cached TypeString encoding for the ID. If there isn't one, or we /// are recursively expanding a type (IncompleteCount != 0) and the cached /// encoding is Recursive, return an empty StringRef. StringRef TypeStringCache::lookupStr(const IdentifierInfo *ID) { if (!ID) return StringRef(); // We have no key. auto I = Map.find(ID); if (I == Map.end()) return StringRef(); // We have no encoding. Entry &E = I->second; if (E.State == Recursive && IncompleteCount) return StringRef(); // We don't use Recursive encodings for member types. if (E.State == Incomplete) { // The incomplete type is being used to break out of recursion. E.State = IncompleteUsed; ++IncompleteUsedCount; } return E.Str.c_str(); } /// The XCore ABI includes a type information section that communicates symbol /// type information to the linker. The linker uses this information to verify /// safety/correctness of things such as array bound and pointers et al. /// The ABI only requires C (and XC) language modules to emit TypeStrings. /// This type information (TypeString) is emitted into meta data for all global /// symbols: definitions, declarations, functions & variables. /// /// The TypeString carries type, qualifier, name, size & value details. /// Please see 'Tools Development Guide' section 2.16.2 for format details: /// /// The output is tested by test/CodeGen/xcore-stringtype.c. /// static bool getTypeString(SmallStringEnc &Enc, const Decl *D, CodeGen::CodeGenModule &CGM, TypeStringCache &TSC); /// XCore uses emitTargetMD to emit TypeString metadata for global symbols. void XCoreTargetCodeGenInfo::emitTargetMD(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &CGM) const { SmallStringEnc Enc; if (getTypeString(Enc, D, CGM, TSC)) { llvm::LLVMContext &Ctx = CGM.getModule().getContext(); llvm::SmallVector MDVals; MDVals.push_back(llvm::ConstantAsMetadata::get(GV)); MDVals.push_back(llvm::MDString::get(Ctx, Enc.str())); llvm::NamedMDNode *MD = CGM.getModule().getOrInsertNamedMetadata("xcore.typestrings"); MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); } } static bool appendType(SmallStringEnc &Enc, QualType QType, const CodeGen::CodeGenModule &CGM, TypeStringCache &TSC); /// Helper function for appendRecordType(). /// Builds a SmallVector containing the encoded field types in declaration order. static bool extractFieldType(SmallVectorImpl &FE, const RecordDecl *RD, const CodeGen::CodeGenModule &CGM, TypeStringCache &TSC) { for (const auto *Field : RD->fields()) { SmallStringEnc Enc; Enc += "m("; Enc += Field->getName(); Enc += "){"; if (Field->isBitField()) { Enc += "b("; llvm::raw_svector_ostream OS(Enc); OS.resync(); OS << Field->getBitWidthValue(CGM.getContext()); OS.flush(); Enc += ':'; } if (!appendType(Enc, Field->getType(), CGM, TSC)) return false; if (Field->isBitField()) Enc += ')'; Enc += '}'; FE.push_back(FieldEncoding(!Field->getName().empty(), Enc)); } return true; } /// Appends structure and union types to Enc and adds encoding to cache. /// Recursively calls appendType (via extractFieldType) for each field. /// Union types have their fields ordered according to the ABI. static bool appendRecordType(SmallStringEnc &Enc, const RecordType *RT, const CodeGen::CodeGenModule &CGM, TypeStringCache &TSC, const IdentifierInfo *ID) { // Append the cached TypeString if we have one. StringRef TypeString = TSC.lookupStr(ID); if (!TypeString.empty()) { Enc += TypeString; return true; } // Start to emit an incomplete TypeString. size_t Start = Enc.size(); Enc += (RT->isUnionType()? 'u' : 's'); Enc += '('; if (ID) Enc += ID->getName(); Enc += "){"; // We collect all encoded fields and order as necessary. bool IsRecursive = false; const RecordDecl *RD = RT->getDecl()->getDefinition(); if (RD && !RD->field_empty()) { // An incomplete TypeString stub is placed in the cache for this RecordType // so that recursive calls to this RecordType will use it whilst building a // complete TypeString for this RecordType. SmallVector FE; std::string StubEnc(Enc.substr(Start).str()); StubEnc += '}'; // StubEnc now holds a valid incomplete TypeString. TSC.addIncomplete(ID, std::move(StubEnc)); if (!extractFieldType(FE, RD, CGM, TSC)) { (void) TSC.removeIncomplete(ID); return false; } IsRecursive = TSC.removeIncomplete(ID); // The ABI requires unions to be sorted but not structures. // See FieldEncoding::operator< for sort algorithm. if (RT->isUnionType()) std::sort(FE.begin(), FE.end()); // We can now complete the TypeString. unsigned E = FE.size(); for (unsigned I = 0; I != E; ++I) { if (I) Enc += ','; Enc += FE[I].str(); } } Enc += '}'; TSC.addIfComplete(ID, Enc.substr(Start), IsRecursive); return true; } /// Appends enum types to Enc and adds the encoding to the cache. static bool appendEnumType(SmallStringEnc &Enc, const EnumType *ET, TypeStringCache &TSC, const IdentifierInfo *ID) { // Append the cached TypeString if we have one. StringRef TypeString = TSC.lookupStr(ID); if (!TypeString.empty()) { Enc += TypeString; return true; } size_t Start = Enc.size(); Enc += "e("; if (ID) Enc += ID->getName(); Enc += "){"; // We collect all encoded enumerations and order them alphanumerically. if (const EnumDecl *ED = ET->getDecl()->getDefinition()) { SmallVector FE; for (auto I = ED->enumerator_begin(), E = ED->enumerator_end(); I != E; ++I) { SmallStringEnc EnumEnc; EnumEnc += "m("; EnumEnc += I->getName(); EnumEnc += "){"; I->getInitVal().toString(EnumEnc); EnumEnc += '}'; FE.push_back(FieldEncoding(!I->getName().empty(), EnumEnc)); } std::sort(FE.begin(), FE.end()); unsigned E = FE.size(); for (unsigned I = 0; I != E; ++I) { if (I) Enc += ','; Enc += FE[I].str(); } } Enc += '}'; TSC.addIfComplete(ID, Enc.substr(Start), false); return true; } /// Appends type's qualifier to Enc. /// This is done prior to appending the type's encoding. static void appendQualifier(SmallStringEnc &Enc, QualType QT) { // Qualifiers are emitted in alphabetical order. static const char *Table[] = {"","c:","r:","cr:","v:","cv:","rv:","crv:"}; int Lookup = 0; if (QT.isConstQualified()) Lookup += 1<<0; if (QT.isRestrictQualified()) Lookup += 1<<1; if (QT.isVolatileQualified()) Lookup += 1<<2; Enc += Table[Lookup]; } /// Appends built-in types to Enc. static bool appendBuiltinType(SmallStringEnc &Enc, const BuiltinType *BT) { const char *EncType; switch (BT->getKind()) { case BuiltinType::Void: EncType = "0"; break; case BuiltinType::Bool: EncType = "b"; break; case BuiltinType::Char_U: EncType = "uc"; break; case BuiltinType::UChar: EncType = "uc"; break; case BuiltinType::SChar: EncType = "sc"; break; case BuiltinType::UShort: EncType = "us"; break; case BuiltinType::Short: EncType = "ss"; break; case BuiltinType::UInt: EncType = "ui"; break; case BuiltinType::Int: EncType = "si"; break; case BuiltinType::ULong: EncType = "ul"; break; case BuiltinType::Long: EncType = "sl"; break; case BuiltinType::ULongLong: EncType = "ull"; break; case BuiltinType::LongLong: EncType = "sll"; break; case BuiltinType::Float: EncType = "ft"; break; case BuiltinType::Double: EncType = "d"; break; case BuiltinType::LongDouble: EncType = "ld"; break; default: return false; } Enc += EncType; return true; } /// Appends a pointer encoding to Enc before calling appendType for the pointee. static bool appendPointerType(SmallStringEnc &Enc, const PointerType *PT, const CodeGen::CodeGenModule &CGM, TypeStringCache &TSC) { Enc += "p("; if (!appendType(Enc, PT->getPointeeType(), CGM, TSC)) return false; Enc += ')'; return true; } /// Appends array encoding to Enc before calling appendType for the element. static bool appendArrayType(SmallStringEnc &Enc, QualType QT, const ArrayType *AT, const CodeGen::CodeGenModule &CGM, TypeStringCache &TSC, StringRef NoSizeEnc) { if (AT->getSizeModifier() != ArrayType::Normal) return false; Enc += "a("; if (const ConstantArrayType *CAT = dyn_cast(AT)) CAT->getSize().toStringUnsigned(Enc); else Enc += NoSizeEnc; // Global arrays use "*", otherwise it is "". Enc += ':'; // The Qualifiers should be attached to the type rather than the array. appendQualifier(Enc, QT); if (!appendType(Enc, AT->getElementType(), CGM, TSC)) return false; Enc += ')'; return true; } /// Appends a function encoding to Enc, calling appendType for the return type /// and the arguments. static bool appendFunctionType(SmallStringEnc &Enc, const FunctionType *FT, const CodeGen::CodeGenModule &CGM, TypeStringCache &TSC) { Enc += "f{"; if (!appendType(Enc, FT->getReturnType(), CGM, TSC)) return false; Enc += "}("; if (const FunctionProtoType *FPT = FT->getAs()) { // N.B. we are only interested in the adjusted param types. auto I = FPT->param_type_begin(); auto E = FPT->param_type_end(); if (I != E) { do { if (!appendType(Enc, *I, CGM, TSC)) return false; ++I; if (I != E) Enc += ','; } while (I != E); if (FPT->isVariadic()) Enc += ",va"; } else { if (FPT->isVariadic()) Enc += "va"; else Enc += '0'; } } Enc += ')'; return true; } /// Handles the type's qualifier before dispatching a call to handle specific /// type encodings. static bool appendType(SmallStringEnc &Enc, QualType QType, const CodeGen::CodeGenModule &CGM, TypeStringCache &TSC) { QualType QT = QType.getCanonicalType(); if (const ArrayType *AT = QT->getAsArrayTypeUnsafe()) // The Qualifiers should be attached to the type rather than the array. // Thus we don't call appendQualifier() here. return appendArrayType(Enc, QT, AT, CGM, TSC, ""); appendQualifier(Enc, QT); if (const BuiltinType *BT = QT->getAs()) return appendBuiltinType(Enc, BT); if (const PointerType *PT = QT->getAs()) return appendPointerType(Enc, PT, CGM, TSC); if (const EnumType *ET = QT->getAs()) return appendEnumType(Enc, ET, TSC, QT.getBaseTypeIdentifier()); if (const RecordType *RT = QT->getAsStructureType()) return appendRecordType(Enc, RT, CGM, TSC, QT.getBaseTypeIdentifier()); if (const RecordType *RT = QT->getAsUnionType()) return appendRecordType(Enc, RT, CGM, TSC, QT.getBaseTypeIdentifier()); if (const FunctionType *FT = QT->getAs()) return appendFunctionType(Enc, FT, CGM, TSC); return false; } static bool getTypeString(SmallStringEnc &Enc, const Decl *D, CodeGen::CodeGenModule &CGM, TypeStringCache &TSC) { if (!D) return false; if (const FunctionDecl *FD = dyn_cast(D)) { if (FD->getLanguageLinkage() != CLanguageLinkage) return false; return appendType(Enc, FD->getType(), CGM, TSC); } if (const VarDecl *VD = dyn_cast(D)) { if (VD->getLanguageLinkage() != CLanguageLinkage) return false; QualType QT = VD->getType().getCanonicalType(); if (const ArrayType *AT = QT->getAsArrayTypeUnsafe()) { // Global ArrayTypes are given a size of '*' if the size is unknown. // The Qualifiers should be attached to the type rather than the array. // Thus we don't call appendQualifier() here. return appendArrayType(Enc, QT, AT, CGM, TSC, "*"); } return appendType(Enc, QT, CGM, TSC); } return false; } //===----------------------------------------------------------------------===// // Driver code //===----------------------------------------------------------------------===// const llvm::Triple &CodeGenModule::getTriple() const { return getTarget().getTriple(); } bool CodeGenModule::supportsCOMDAT() const { return !getTriple().isOSBinFormatMachO(); } const TargetCodeGenInfo &CodeGenModule::getTargetCodeGenInfo() { if (TheTargetCodeGenInfo) return *TheTargetCodeGenInfo; const llvm::Triple &Triple = getTarget().getTriple(); switch (Triple.getArch()) { default: return *(TheTargetCodeGenInfo = new DefaultTargetCodeGenInfo(Types)); case llvm::Triple::le32: return *(TheTargetCodeGenInfo = new PNaClTargetCodeGenInfo(Types)); case llvm::Triple::mips: case llvm::Triple::mipsel: return *(TheTargetCodeGenInfo = new MIPSTargetCodeGenInfo(Types, true)); case llvm::Triple::mips64: case llvm::Triple::mips64el: return *(TheTargetCodeGenInfo = new MIPSTargetCodeGenInfo(Types, false)); case llvm::Triple::aarch64: case llvm::Triple::aarch64_be: { AArch64ABIInfo::ABIKind Kind = AArch64ABIInfo::AAPCS; if (getTarget().getABI() == "darwinpcs") Kind = AArch64ABIInfo::DarwinPCS; return *(TheTargetCodeGenInfo = new AArch64TargetCodeGenInfo(Types, Kind)); } case llvm::Triple::arm: case llvm::Triple::armeb: case llvm::Triple::thumb: case llvm::Triple::thumbeb: { ARMABIInfo::ABIKind Kind = ARMABIInfo::AAPCS; if (getTarget().getABI() == "apcs-gnu") Kind = ARMABIInfo::APCS; else if (CodeGenOpts.FloatABI == "hard" || (CodeGenOpts.FloatABI != "soft" && Triple.getEnvironment() == llvm::Triple::GNUEABIHF)) Kind = ARMABIInfo::AAPCS_VFP; switch (Triple.getOS()) { case llvm::Triple::NaCl: return *(TheTargetCodeGenInfo = new NaClARMTargetCodeGenInfo(Types, Kind)); default: return *(TheTargetCodeGenInfo = new ARMTargetCodeGenInfo(Types, Kind)); } } case llvm::Triple::ppc: return *(TheTargetCodeGenInfo = new PPC32TargetCodeGenInfo(Types)); case llvm::Triple::ppc64: if (Triple.isOSBinFormatELF()) { PPC64_SVR4_ABIInfo::ABIKind Kind = PPC64_SVR4_ABIInfo::ELFv1; if (getTarget().getABI() == "elfv2") Kind = PPC64_SVR4_ABIInfo::ELFv2; return *(TheTargetCodeGenInfo = new PPC64_SVR4_TargetCodeGenInfo(Types, Kind)); } else return *(TheTargetCodeGenInfo = new PPC64TargetCodeGenInfo(Types)); case llvm::Triple::ppc64le: { assert(Triple.isOSBinFormatELF() && "PPC64 LE non-ELF not supported!"); PPC64_SVR4_ABIInfo::ABIKind Kind = PPC64_SVR4_ABIInfo::ELFv2; if (getTarget().getABI() == "elfv1") Kind = PPC64_SVR4_ABIInfo::ELFv1; return *(TheTargetCodeGenInfo = new PPC64_SVR4_TargetCodeGenInfo(Types, Kind)); } case llvm::Triple::nvptx: case llvm::Triple::nvptx64: return *(TheTargetCodeGenInfo = new NVPTXTargetCodeGenInfo(Types)); case llvm::Triple::msp430: return *(TheTargetCodeGenInfo = new MSP430TargetCodeGenInfo(Types)); case llvm::Triple::systemz: return *(TheTargetCodeGenInfo = new SystemZTargetCodeGenInfo(Types)); case llvm::Triple::tce: return *(TheTargetCodeGenInfo = new TCETargetCodeGenInfo(Types)); case llvm::Triple::x86: { bool IsDarwinVectorABI = Triple.isOSDarwin(); bool IsSmallStructInRegABI = X86_32TargetCodeGenInfo::isStructReturnInRegABI(Triple, CodeGenOpts); bool IsWin32FloatStructABI = Triple.isOSWindows() && !Triple.isOSCygMing(); if (Triple.getOS() == llvm::Triple::Win32) { return *(TheTargetCodeGenInfo = new WinX86_32TargetCodeGenInfo(Types, IsDarwinVectorABI, IsSmallStructInRegABI, IsWin32FloatStructABI, CodeGenOpts.NumRegisterParameters)); } else { return *(TheTargetCodeGenInfo = new X86_32TargetCodeGenInfo(Types, IsDarwinVectorABI, IsSmallStructInRegABI, IsWin32FloatStructABI, CodeGenOpts.NumRegisterParameters)); } } case llvm::Triple::x86_64: { bool HasAVX = getTarget().getABI() == "avx"; switch (Triple.getOS()) { case llvm::Triple::Win32: return *(TheTargetCodeGenInfo = new WinX86_64TargetCodeGenInfo(Types, HasAVX)); case llvm::Triple::NaCl: return *(TheTargetCodeGenInfo = new NaClX86_64TargetCodeGenInfo(Types, HasAVX)); default: return *(TheTargetCodeGenInfo = new X86_64TargetCodeGenInfo(Types, HasAVX)); } } case llvm::Triple::hexagon: return *(TheTargetCodeGenInfo = new HexagonTargetCodeGenInfo(Types)); case llvm::Triple::r600: return *(TheTargetCodeGenInfo = new AMDGPUTargetCodeGenInfo(Types)); case llvm::Triple::amdgcn: return *(TheTargetCodeGenInfo = new AMDGPUTargetCodeGenInfo(Types)); case llvm::Triple::sparcv9: return *(TheTargetCodeGenInfo = new SparcV9TargetCodeGenInfo(Types)); case llvm::Triple::xcore: return *(TheTargetCodeGenInfo = new XCoreTargetCodeGenInfo(Types)); } } Index: projects/clang360-import/contrib/llvm/tools/clang/lib/Driver/ToolChains.cpp =================================================================== --- projects/clang360-import/contrib/llvm/tools/clang/lib/Driver/ToolChains.cpp (revision 279024) +++ projects/clang360-import/contrib/llvm/tools/clang/lib/Driver/ToolChains.cpp (revision 279025) @@ -1,3489 +1,3490 @@ //===--- ToolChains.cpp - ToolChain Implementations -----------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// #include "ToolChains.h" #include "clang/Basic/ObjCRuntime.h" #include "clang/Basic/Version.h" #include "clang/Config/config.h" // for GCC_INSTALL_PREFIX #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/OptTable.h" #include "llvm/Option/Option.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/Program.h" #include "llvm/Support/raw_ostream.h" #include // ::getenv #include using namespace clang::driver; using namespace clang::driver::toolchains; using namespace clang; using namespace llvm::opt; MachO::MachO(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) : ToolChain(D, Triple, Args) { getProgramPaths().push_back(getDriver().getInstalledDir()); if (getDriver().getInstalledDir() != getDriver().Dir) getProgramPaths().push_back(getDriver().Dir); // We expect 'as', 'ld', etc. to be adjacent to our install dir. getProgramPaths().push_back(getDriver().getInstalledDir()); if (getDriver().getInstalledDir() != getDriver().Dir) getProgramPaths().push_back(getDriver().Dir); } /// Darwin - Darwin tool chain for i386 and x86_64. Darwin::Darwin(const Driver & D, const llvm::Triple & Triple, const ArgList & Args) : MachO(D, Triple, Args), TargetInitialized(false) { // Compute the initial Darwin version from the triple unsigned Major, Minor, Micro; if (!Triple.getMacOSXVersion(Major, Minor, Micro)) getDriver().Diag(diag::err_drv_invalid_darwin_version) << Triple.getOSName(); llvm::raw_string_ostream(MacosxVersionMin) << Major << '.' << Minor << '.' << Micro; // FIXME: DarwinVersion is only used to find GCC's libexec directory. // It should be removed when we stop supporting that. DarwinVersion[0] = Minor + 4; DarwinVersion[1] = Micro; DarwinVersion[2] = 0; // Compute the initial iOS version from the triple Triple.getiOSVersion(Major, Minor, Micro); llvm::raw_string_ostream(iOSVersionMin) << Major << '.' << Minor << '.' << Micro; } types::ID MachO::LookupTypeForExtension(const char *Ext) const { types::ID Ty = types::lookupTypeForExtension(Ext); // Darwin always preprocesses assembly files (unless -x is used explicitly). if (Ty == types::TY_PP_Asm) return types::TY_Asm; return Ty; } bool MachO::HasNativeLLVMSupport() const { return true; } /// Darwin provides an ARC runtime starting in MacOS X 10.7 and iOS 5.0. ObjCRuntime Darwin::getDefaultObjCRuntime(bool isNonFragile) const { if (isTargetIOSBased()) return ObjCRuntime(ObjCRuntime::iOS, TargetVersion); if (isNonFragile) return ObjCRuntime(ObjCRuntime::MacOSX, TargetVersion); return ObjCRuntime(ObjCRuntime::FragileMacOSX, TargetVersion); } /// Darwin provides a blocks runtime starting in MacOS X 10.6 and iOS 3.2. bool Darwin::hasBlocksRuntime() const { if (isTargetIOSBased()) return !isIPhoneOSVersionLT(3, 2); else { assert(isTargetMacOS() && "unexpected darwin target"); return !isMacosxVersionLT(10, 6); } } static const char *GetArmArchForMArch(StringRef Value) { return llvm::StringSwitch(Value) .Case("armv6k", "armv6") .Case("armv6m", "armv6m") .Case("armv5tej", "armv5") .Case("xscale", "xscale") .Case("armv4t", "armv4t") .Case("armv7", "armv7") .Cases("armv7a", "armv7-a", "armv7") .Cases("armv7r", "armv7-r", "armv7") .Cases("armv7em", "armv7e-m", "armv7em") .Cases("armv7k", "armv7-k", "armv7k") .Cases("armv7m", "armv7-m", "armv7m") .Cases("armv7s", "armv7-s", "armv7s") .Default(nullptr); } static const char *GetArmArchForMCpu(StringRef Value) { return llvm::StringSwitch(Value) .Cases("arm9e", "arm946e-s", "arm966e-s", "arm968e-s", "arm926ej-s","armv5") .Cases("arm10e", "arm10tdmi", "armv5") .Cases("arm1020t", "arm1020e", "arm1022e", "arm1026ej-s", "armv5") .Case("xscale", "xscale") .Cases("arm1136j-s", "arm1136jf-s", "arm1176jz-s", "arm1176jzf-s", "armv6") .Case("cortex-m0", "armv6m") .Cases("cortex-a5", "cortex-a7", "cortex-a8", "armv7") .Cases("cortex-a9", "cortex-a12", "cortex-a15", "cortex-a17", "krait", "armv7") .Cases("cortex-r4", "cortex-r5", "armv7r") .Case("cortex-m3", "armv7m") .Cases("cortex-m4", "cortex-m7", "armv7em") .Case("swift", "armv7s") .Default(nullptr); } static bool isSoftFloatABI(const ArgList &Args) { Arg *A = Args.getLastArg(options::OPT_msoft_float, options::OPT_mhard_float, options::OPT_mfloat_abi_EQ); if (!A) return false; return A->getOption().matches(options::OPT_msoft_float) || (A->getOption().matches(options::OPT_mfloat_abi_EQ) && A->getValue() == StringRef("soft")); } StringRef MachO::getMachOArchName(const ArgList &Args) const { switch (getTriple().getArch()) { default: return getDefaultUniversalArchName(); case llvm::Triple::aarch64: return "arm64"; case llvm::Triple::thumb: case llvm::Triple::arm: { if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) if (const char *Arch = GetArmArchForMArch(A->getValue())) return Arch; if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) if (const char *Arch = GetArmArchForMCpu(A->getValue())) return Arch; return "arm"; } } } Darwin::~Darwin() { } MachO::~MachO() { } std::string MachO::ComputeEffectiveClangTriple(const ArgList &Args, types::ID InputType) const { llvm::Triple Triple(ComputeLLVMTriple(Args, InputType)); return Triple.getTriple(); } std::string Darwin::ComputeEffectiveClangTriple(const ArgList &Args, types::ID InputType) const { llvm::Triple Triple(ComputeLLVMTriple(Args, InputType)); // If the target isn't initialized (e.g., an unknown Darwin platform, return // the default triple). if (!isTargetInitialized()) return Triple.getTriple(); SmallString<16> Str; Str += isTargetIOSBased() ? "ios" : "macosx"; Str += getTargetVersion().getAsString(); Triple.setOSName(Str); return Triple.getTriple(); } void Generic_ELF::anchor() {} Tool *MachO::getTool(Action::ActionClass AC) const { switch (AC) { case Action::LipoJobClass: if (!Lipo) Lipo.reset(new tools::darwin::Lipo(*this)); return Lipo.get(); case Action::DsymutilJobClass: if (!Dsymutil) Dsymutil.reset(new tools::darwin::Dsymutil(*this)); return Dsymutil.get(); case Action::VerifyDebugInfoJobClass: if (!VerifyDebug) VerifyDebug.reset(new tools::darwin::VerifyDebug(*this)); return VerifyDebug.get(); default: return ToolChain::getTool(AC); } } Tool *MachO::buildLinker() const { return new tools::darwin::Link(*this); } Tool *MachO::buildAssembler() const { return new tools::darwin::Assemble(*this); } DarwinClang::DarwinClang(const Driver &D, const llvm::Triple& Triple, const ArgList &Args) : Darwin(D, Triple, Args) { } void DarwinClang::addClangWarningOptions(ArgStringList &CC1Args) const { // For iOS, 64-bit, promote certain warnings to errors. if (!isTargetMacOS() && getTriple().isArch64Bit()) { // Always enable -Wdeprecated-objc-isa-usage and promote it // to an error. CC1Args.push_back("-Wdeprecated-objc-isa-usage"); CC1Args.push_back("-Werror=deprecated-objc-isa-usage"); // Also error about implicit function declarations, as that // can impact calling conventions. CC1Args.push_back("-Werror=implicit-function-declaration"); } } /// \brief Determine whether Objective-C automated reference counting is /// enabled. static bool isObjCAutoRefCount(const ArgList &Args) { return Args.hasFlag(options::OPT_fobjc_arc, options::OPT_fno_objc_arc, false); } void DarwinClang::AddLinkARCArgs(const ArgList &Args, ArgStringList &CmdArgs) const { // Avoid linking compatibility stubs on i386 mac. if (isTargetMacOS() && getArch() == llvm::Triple::x86) return; ObjCRuntime runtime = getDefaultObjCRuntime(/*nonfragile*/ true); if ((runtime.hasNativeARC() || !isObjCAutoRefCount(Args)) && runtime.hasSubscripting()) return; CmdArgs.push_back("-force_load"); SmallString<128> P(getDriver().ClangExecutable); llvm::sys::path::remove_filename(P); // 'clang' llvm::sys::path::remove_filename(P); // 'bin' llvm::sys::path::append(P, "lib", "arc", "libarclite_"); // Mash in the platform. if (isTargetIOSSimulator()) P += "iphonesimulator"; else if (isTargetIPhoneOS()) P += "iphoneos"; else P += "macosx"; P += ".a"; CmdArgs.push_back(Args.MakeArgString(P)); } void MachO::AddLinkRuntimeLib(const ArgList &Args, ArgStringList &CmdArgs, StringRef DarwinLibName, bool AlwaysLink, bool IsEmbedded, bool AddRPath) const { SmallString<128> Dir(getDriver().ResourceDir); llvm::sys::path::append(Dir, "lib", IsEmbedded ? "macho_embedded" : "darwin"); SmallString<128> P(Dir); llvm::sys::path::append(P, DarwinLibName); // For now, allow missing resource libraries to support developers who may // not have compiler-rt checked out or integrated into their build (unless // we explicitly force linking with this library). if (AlwaysLink || llvm::sys::fs::exists(P.str())) CmdArgs.push_back(Args.MakeArgString(P.str())); // Adding the rpaths might negatively interact when other rpaths are involved, // so we should make sure we add the rpaths last, after all user-specified // rpaths. This is currently true from this place, but we need to be // careful if this function is ever called before user's rpaths are emitted. if (AddRPath) { assert(DarwinLibName.endswith(".dylib") && "must be a dynamic library"); // Add @executable_path to rpath to support having the dylib copied with // the executable. CmdArgs.push_back("-rpath"); CmdArgs.push_back("@executable_path"); // Add the path to the resource dir to rpath to support using the dylib // from the default location without copying. CmdArgs.push_back("-rpath"); CmdArgs.push_back(Args.MakeArgString(Dir.str())); } } void DarwinClang::AddLinkRuntimeLibArgs(const ArgList &Args, ArgStringList &CmdArgs) const { // Darwin only supports the compiler-rt based runtime libraries. switch (GetRuntimeLibType(Args)) { case ToolChain::RLT_CompilerRT: break; default: getDriver().Diag(diag::err_drv_unsupported_rtlib_for_platform) << Args.getLastArg(options::OPT_rtlib_EQ)->getValue() << "darwin"; return; } // Darwin doesn't support real static executables, don't link any runtime // libraries with -static. if (Args.hasArg(options::OPT_static) || Args.hasArg(options::OPT_fapple_kext) || Args.hasArg(options::OPT_mkernel)) return; // Reject -static-libgcc for now, we can deal with this when and if someone // cares. This is useful in situations where someone wants to statically link // something like libstdc++, and needs its runtime support routines. if (const Arg *A = Args.getLastArg(options::OPT_static_libgcc)) { getDriver().Diag(diag::err_drv_unsupported_opt) << A->getAsString(Args); return; } // If we are building profile support, link that library in. if (Args.hasFlag(options::OPT_fprofile_arcs, options::OPT_fno_profile_arcs, false) || Args.hasArg(options::OPT_fprofile_generate) || Args.hasArg(options::OPT_fprofile_instr_generate) || Args.hasArg(options::OPT_fcreate_profile) || Args.hasArg(options::OPT_coverage)) { // Select the appropriate runtime library for the target. if (isTargetIOSBased()) AddLinkRuntimeLib(Args, CmdArgs, "libclang_rt.profile_ios.a"); else AddLinkRuntimeLib(Args, CmdArgs, "libclang_rt.profile_osx.a"); } const SanitizerArgs &Sanitize = getSanitizerArgs(); // Add Ubsan runtime library, if required. if (Sanitize.needsUbsanRt()) { // FIXME: Move this check to SanitizerArgs::filterUnsupportedKinds. if (isTargetIOSBased()) { getDriver().Diag(diag::err_drv_clang_unsupported_per_platform) << "-fsanitize=undefined"; } else { assert(isTargetMacOS() && "unexpected non OS X target"); AddLinkRuntimeLib(Args, CmdArgs, "libclang_rt.ubsan_osx.a", true); // The Ubsan runtime library requires C++. AddCXXStdlibLibArgs(Args, CmdArgs); } } // Add ASAN runtime library, if required. Dynamic libraries and bundles // should not be linked with the runtime library. if (Sanitize.needsAsanRt()) { // FIXME: Move this check to SanitizerArgs::filterUnsupportedKinds. if (isTargetIPhoneOS()) { getDriver().Diag(diag::err_drv_clang_unsupported_per_platform) << "-fsanitize=address"; } else { if (!Args.hasArg(options::OPT_dynamiclib) && !Args.hasArg(options::OPT_bundle)) { // The ASAN runtime library requires C++. AddCXXStdlibLibArgs(Args, CmdArgs); } if (isTargetMacOS()) { AddLinkRuntimeLib(Args, CmdArgs, "libclang_rt.asan_osx_dynamic.dylib", /*AlwaysLink*/ true, /*IsEmbedded*/ false, /*AddRPath*/ true); } else { if (isTargetIOSSimulator()) { AddLinkRuntimeLib(Args, CmdArgs, "libclang_rt.asan_iossim_dynamic.dylib", /*AlwaysLink*/ true, /*IsEmbedded*/ false, /*AddRPath*/ true); } } } } // Otherwise link libSystem, then the dynamic runtime library, and finally any // target specific static runtime library. CmdArgs.push_back("-lSystem"); // Select the dynamic runtime library and the target specific static library. if (isTargetIOSBased()) { // If we are compiling as iOS / simulator, don't attempt to link libgcc_s.1, // it never went into the SDK. // Linking against libgcc_s.1 isn't needed for iOS 5.0+ if (isIPhoneOSVersionLT(5, 0) && !isTargetIOSSimulator() && getTriple().getArch() != llvm::Triple::aarch64) CmdArgs.push_back("-lgcc_s.1"); // We currently always need a static runtime library for iOS. AddLinkRuntimeLib(Args, CmdArgs, "libclang_rt.ios.a"); } else { assert(isTargetMacOS() && "unexpected non MacOS platform"); // The dynamic runtime library was merged with libSystem for 10.6 and // beyond; only 10.4 and 10.5 need an additional runtime library. if (isMacosxVersionLT(10, 5)) CmdArgs.push_back("-lgcc_s.10.4"); else if (isMacosxVersionLT(10, 6)) CmdArgs.push_back("-lgcc_s.10.5"); // For OS X, we thought we would only need a static runtime library when // targeting 10.4, to provide versions of the static functions which were // omitted from 10.4.dylib. // // Unfortunately, that turned out to not be true, because Darwin system // headers can still use eprintf on i386, and it is not exported from // libSystem. Therefore, we still must provide a runtime library just for // the tiny tiny handful of projects that *might* use that symbol. if (isMacosxVersionLT(10, 5)) { AddLinkRuntimeLib(Args, CmdArgs, "libclang_rt.10.4.a"); } else { if (getTriple().getArch() == llvm::Triple::x86) AddLinkRuntimeLib(Args, CmdArgs, "libclang_rt.eprintf.a"); AddLinkRuntimeLib(Args, CmdArgs, "libclang_rt.osx.a"); } } } void Darwin::AddDeploymentTarget(DerivedArgList &Args) const { const OptTable &Opts = getDriver().getOpts(); // Support allowing the SDKROOT environment variable used by xcrun and other // Xcode tools to define the default sysroot, by making it the default for // isysroot. if (const Arg *A = Args.getLastArg(options::OPT_isysroot)) { // Warn if the path does not exist. if (!llvm::sys::fs::exists(A->getValue())) getDriver().Diag(clang::diag::warn_missing_sysroot) << A->getValue(); } else { if (char *env = ::getenv("SDKROOT")) { // We only use this value as the default if it is an absolute path, // exists, and it is not the root path. if (llvm::sys::path::is_absolute(env) && llvm::sys::fs::exists(env) && StringRef(env) != "/") { Args.append(Args.MakeSeparateArg( nullptr, Opts.getOption(options::OPT_isysroot), env)); } } } Arg *OSXVersion = Args.getLastArg(options::OPT_mmacosx_version_min_EQ); Arg *iOSVersion = Args.getLastArg(options::OPT_miphoneos_version_min_EQ); if (OSXVersion && iOSVersion) { getDriver().Diag(diag::err_drv_argument_not_allowed_with) << OSXVersion->getAsString(Args) << iOSVersion->getAsString(Args); iOSVersion = nullptr; } else if (!OSXVersion && !iOSVersion) { // If no deployment target was specified on the command line, check for // environment defines. StringRef OSXTarget; StringRef iOSTarget; if (char *env = ::getenv("MACOSX_DEPLOYMENT_TARGET")) OSXTarget = env; if (char *env = ::getenv("IPHONEOS_DEPLOYMENT_TARGET")) iOSTarget = env; // If no '-miphoneos-version-min' specified on the command line and // IPHONEOS_DEPLOYMENT_TARGET is not defined, see if we can set the default // based on -isysroot. if (iOSTarget.empty()) { if (const Arg *A = Args.getLastArg(options::OPT_isysroot)) { StringRef first, second; StringRef isysroot = A->getValue(); std::tie(first, second) = isysroot.split(StringRef("SDKs/iPhoneOS")); if (second != "") iOSTarget = second.substr(0,3); } } // If no OSX or iOS target has been specified and we're compiling for armv7, // go ahead as assume we're targeting iOS. StringRef MachOArchName = getMachOArchName(Args); if (OSXTarget.empty() && iOSTarget.empty() && (MachOArchName == "armv7" || MachOArchName == "armv7s" || MachOArchName == "arm64")) iOSTarget = iOSVersionMin; // Allow conflicts among OSX and iOS for historical reasons, but choose the // default platform. if (!OSXTarget.empty() && !iOSTarget.empty()) { if (getTriple().getArch() == llvm::Triple::arm || getTriple().getArch() == llvm::Triple::aarch64 || getTriple().getArch() == llvm::Triple::thumb) OSXTarget = ""; else iOSTarget = ""; } if (!OSXTarget.empty()) { const Option O = Opts.getOption(options::OPT_mmacosx_version_min_EQ); OSXVersion = Args.MakeJoinedArg(nullptr, O, OSXTarget); Args.append(OSXVersion); } else if (!iOSTarget.empty()) { const Option O = Opts.getOption(options::OPT_miphoneos_version_min_EQ); iOSVersion = Args.MakeJoinedArg(nullptr, O, iOSTarget); Args.append(iOSVersion); } else if (MachOArchName != "armv6m" && MachOArchName != "armv7m" && MachOArchName != "armv7em") { // Otherwise, assume we are targeting OS X. const Option O = Opts.getOption(options::OPT_mmacosx_version_min_EQ); OSXVersion = Args.MakeJoinedArg(nullptr, O, MacosxVersionMin); Args.append(OSXVersion); } } DarwinPlatformKind Platform; if (OSXVersion) Platform = MacOS; else if (iOSVersion) Platform = IPhoneOS; else llvm_unreachable("Unable to infer Darwin variant"); // Set the tool chain target information. unsigned Major, Minor, Micro; bool HadExtra; if (Platform == MacOS) { assert(!iOSVersion && "Unknown target platform!"); if (!Driver::GetReleaseVersion(OSXVersion->getValue(), Major, Minor, Micro, HadExtra) || HadExtra || Major != 10 || Minor >= 100 || Micro >= 100) getDriver().Diag(diag::err_drv_invalid_version_number) << OSXVersion->getAsString(Args); } else if (Platform == IPhoneOS) { assert(iOSVersion && "Unknown target platform!"); if (!Driver::GetReleaseVersion(iOSVersion->getValue(), Major, Minor, Micro, HadExtra) || HadExtra || Major >= 10 || Minor >= 100 || Micro >= 100) getDriver().Diag(diag::err_drv_invalid_version_number) << iOSVersion->getAsString(Args); } else llvm_unreachable("unknown kind of Darwin platform"); // Recognize iOS targets with an x86 architecture as the iOS simulator. if (iOSVersion && (getTriple().getArch() == llvm::Triple::x86 || getTriple().getArch() == llvm::Triple::x86_64)) Platform = IPhoneOSSimulator; setTarget(Platform, Major, Minor, Micro); } void DarwinClang::AddCXXStdlibLibArgs(const ArgList &Args, ArgStringList &CmdArgs) const { CXXStdlibType Type = GetCXXStdlibType(Args); switch (Type) { case ToolChain::CST_Libcxx: CmdArgs.push_back("-lc++"); break; case ToolChain::CST_Libstdcxx: { // Unfortunately, -lstdc++ doesn't always exist in the standard search path; // it was previously found in the gcc lib dir. However, for all the Darwin // platforms we care about it was -lstdc++.6, so we search for that // explicitly if we can't see an obvious -lstdc++ candidate. // Check in the sysroot first. if (const Arg *A = Args.getLastArg(options::OPT_isysroot)) { SmallString<128> P(A->getValue()); llvm::sys::path::append(P, "usr", "lib", "libstdc++.dylib"); if (!llvm::sys::fs::exists(P.str())) { llvm::sys::path::remove_filename(P); llvm::sys::path::append(P, "libstdc++.6.dylib"); if (llvm::sys::fs::exists(P.str())) { CmdArgs.push_back(Args.MakeArgString(P.str())); return; } } } // Otherwise, look in the root. // FIXME: This should be removed someday when we don't have to care about // 10.6 and earlier, where /usr/lib/libstdc++.dylib does not exist. if (!llvm::sys::fs::exists("/usr/lib/libstdc++.dylib") && llvm::sys::fs::exists("/usr/lib/libstdc++.6.dylib")) { CmdArgs.push_back("/usr/lib/libstdc++.6.dylib"); return; } // Otherwise, let the linker search. CmdArgs.push_back("-lstdc++"); break; } } } void DarwinClang::AddCCKextLibArgs(const ArgList &Args, ArgStringList &CmdArgs) const { // For Darwin platforms, use the compiler-rt-based support library // instead of the gcc-provided one (which is also incidentally // only present in the gcc lib dir, which makes it hard to find). SmallString<128> P(getDriver().ResourceDir); llvm::sys::path::append(P, "lib", "darwin"); // Use the newer cc_kext for iOS ARM after 6.0. if (!isTargetIPhoneOS() || isTargetIOSSimulator() || getTriple().getArch() == llvm::Triple::aarch64 || !isIPhoneOSVersionLT(6, 0)) { llvm::sys::path::append(P, "libclang_rt.cc_kext.a"); } else { llvm::sys::path::append(P, "libclang_rt.cc_kext_ios5.a"); } // For now, allow missing resource libraries to support developers who may // not have compiler-rt checked out or integrated into their build. if (llvm::sys::fs::exists(P.str())) CmdArgs.push_back(Args.MakeArgString(P.str())); } DerivedArgList *MachO::TranslateArgs(const DerivedArgList &Args, const char *BoundArch) const { DerivedArgList *DAL = new DerivedArgList(Args.getBaseArgs()); const OptTable &Opts = getDriver().getOpts(); // FIXME: We really want to get out of the tool chain level argument // translation business, as it makes the driver functionality much // more opaque. For now, we follow gcc closely solely for the // purpose of easily achieving feature parity & testability. Once we // have something that works, we should reevaluate each translation // and try to push it down into tool specific logic. for (Arg *A : Args) { if (A->getOption().matches(options::OPT_Xarch__)) { // Skip this argument unless the architecture matches either the toolchain // triple arch, or the arch being bound. llvm::Triple::ArchType XarchArch = tools::darwin::getArchTypeForMachOArchName(A->getValue(0)); if (!(XarchArch == getArch() || (BoundArch && XarchArch == tools::darwin::getArchTypeForMachOArchName(BoundArch)))) continue; Arg *OriginalArg = A; unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1)); unsigned Prev = Index; std::unique_ptr XarchArg(Opts.ParseOneArg(Args, Index)); // If the argument parsing failed or more than one argument was // consumed, the -Xarch_ argument's parameter tried to consume // extra arguments. Emit an error and ignore. // // We also want to disallow any options which would alter the // driver behavior; that isn't going to work in our model. We // use isDriverOption() as an approximation, although things // like -O4 are going to slip through. if (!XarchArg || Index > Prev + 1) { getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args) << A->getAsString(Args); continue; } else if (XarchArg->getOption().hasFlag(options::DriverOption)) { getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver) << A->getAsString(Args); continue; } XarchArg->setBaseArg(A); A = XarchArg.release(); DAL->AddSynthesizedArg(A); // Linker input arguments require custom handling. The problem is that we // have already constructed the phase actions, so we can not treat them as // "input arguments". if (A->getOption().hasFlag(options::LinkerInput)) { // Convert the argument into individual Zlinker_input_args. for (unsigned i = 0, e = A->getNumValues(); i != e; ++i) { DAL->AddSeparateArg(OriginalArg, Opts.getOption(options::OPT_Zlinker_input), A->getValue(i)); } continue; } } // Sob. These is strictly gcc compatible for the time being. Apple // gcc translates options twice, which means that self-expanding // options add duplicates. switch ((options::ID) A->getOption().getID()) { default: DAL->append(A); break; case options::OPT_mkernel: case options::OPT_fapple_kext: DAL->append(A); DAL->AddFlagArg(A, Opts.getOption(options::OPT_static)); break; case options::OPT_dependency_file: DAL->AddSeparateArg(A, Opts.getOption(options::OPT_MF), A->getValue()); break; case options::OPT_gfull: DAL->AddFlagArg(A, Opts.getOption(options::OPT_g_Flag)); DAL->AddFlagArg(A, Opts.getOption(options::OPT_fno_eliminate_unused_debug_symbols)); break; case options::OPT_gused: DAL->AddFlagArg(A, Opts.getOption(options::OPT_g_Flag)); DAL->AddFlagArg(A, Opts.getOption(options::OPT_feliminate_unused_debug_symbols)); break; case options::OPT_shared: DAL->AddFlagArg(A, Opts.getOption(options::OPT_dynamiclib)); break; case options::OPT_fconstant_cfstrings: DAL->AddFlagArg(A, Opts.getOption(options::OPT_mconstant_cfstrings)); break; case options::OPT_fno_constant_cfstrings: DAL->AddFlagArg(A, Opts.getOption(options::OPT_mno_constant_cfstrings)); break; case options::OPT_Wnonportable_cfstrings: DAL->AddFlagArg(A, Opts.getOption(options::OPT_mwarn_nonportable_cfstrings)); break; case options::OPT_Wno_nonportable_cfstrings: DAL->AddFlagArg(A, Opts.getOption(options::OPT_mno_warn_nonportable_cfstrings)); break; case options::OPT_fpascal_strings: DAL->AddFlagArg(A, Opts.getOption(options::OPT_mpascal_strings)); break; case options::OPT_fno_pascal_strings: DAL->AddFlagArg(A, Opts.getOption(options::OPT_mno_pascal_strings)); break; } } if (getTriple().getArch() == llvm::Triple::x86 || getTriple().getArch() == llvm::Triple::x86_64) if (!Args.hasArgNoClaim(options::OPT_mtune_EQ)) DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_mtune_EQ), "core2"); // Add the arch options based on the particular spelling of -arch, to match // how the driver driver works. if (BoundArch) { StringRef Name = BoundArch; const Option MCpu = Opts.getOption(options::OPT_mcpu_EQ); const Option MArch = Opts.getOption(options::OPT_march_EQ); // This code must be kept in sync with LLVM's getArchTypeForDarwinArch, // which defines the list of which architectures we accept. if (Name == "ppc") ; else if (Name == "ppc601") DAL->AddJoinedArg(nullptr, MCpu, "601"); else if (Name == "ppc603") DAL->AddJoinedArg(nullptr, MCpu, "603"); else if (Name == "ppc604") DAL->AddJoinedArg(nullptr, MCpu, "604"); else if (Name == "ppc604e") DAL->AddJoinedArg(nullptr, MCpu, "604e"); else if (Name == "ppc750") DAL->AddJoinedArg(nullptr, MCpu, "750"); else if (Name == "ppc7400") DAL->AddJoinedArg(nullptr, MCpu, "7400"); else if (Name == "ppc7450") DAL->AddJoinedArg(nullptr, MCpu, "7450"); else if (Name == "ppc970") DAL->AddJoinedArg(nullptr, MCpu, "970"); else if (Name == "ppc64" || Name == "ppc64le") DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_m64)); else if (Name == "i386") ; else if (Name == "i486") DAL->AddJoinedArg(nullptr, MArch, "i486"); else if (Name == "i586") DAL->AddJoinedArg(nullptr, MArch, "i586"); else if (Name == "i686") DAL->AddJoinedArg(nullptr, MArch, "i686"); else if (Name == "pentium") DAL->AddJoinedArg(nullptr, MArch, "pentium"); else if (Name == "pentium2") DAL->AddJoinedArg(nullptr, MArch, "pentium2"); else if (Name == "pentpro") DAL->AddJoinedArg(nullptr, MArch, "pentiumpro"); else if (Name == "pentIIm3") DAL->AddJoinedArg(nullptr, MArch, "pentium2"); else if (Name == "x86_64") DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_m64)); else if (Name == "x86_64h") { DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_m64)); DAL->AddJoinedArg(nullptr, MArch, "x86_64h"); } else if (Name == "arm") DAL->AddJoinedArg(nullptr, MArch, "armv4t"); else if (Name == "armv4t") DAL->AddJoinedArg(nullptr, MArch, "armv4t"); else if (Name == "armv5") DAL->AddJoinedArg(nullptr, MArch, "armv5tej"); else if (Name == "xscale") DAL->AddJoinedArg(nullptr, MArch, "xscale"); else if (Name == "armv6") DAL->AddJoinedArg(nullptr, MArch, "armv6k"); else if (Name == "armv6m") DAL->AddJoinedArg(nullptr, MArch, "armv6m"); else if (Name == "armv7") DAL->AddJoinedArg(nullptr, MArch, "armv7a"); else if (Name == "armv7em") DAL->AddJoinedArg(nullptr, MArch, "armv7em"); else if (Name == "armv7k") DAL->AddJoinedArg(nullptr, MArch, "armv7k"); else if (Name == "armv7m") DAL->AddJoinedArg(nullptr, MArch, "armv7m"); else if (Name == "armv7s") DAL->AddJoinedArg(nullptr, MArch, "armv7s"); } return DAL; } void MachO::AddLinkRuntimeLibArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const { // Embedded targets are simple at the moment, not supporting sanitizers and // with different libraries for each member of the product { static, PIC } x // { hard-float, soft-float } llvm::SmallString<32> CompilerRT = StringRef("libclang_rt."); CompilerRT += tools::arm::getARMFloatABI(getDriver(), Args, getTriple()) == "hard" ? "hard" : "soft"; CompilerRT += Args.hasArg(options::OPT_fPIC) ? "_pic.a" : "_static.a"; AddLinkRuntimeLib(Args, CmdArgs, CompilerRT, false, true); } DerivedArgList *Darwin::TranslateArgs(const DerivedArgList &Args, const char *BoundArch) const { // First get the generic Apple args, before moving onto Darwin-specific ones. DerivedArgList *DAL = MachO::TranslateArgs(Args, BoundArch); const OptTable &Opts = getDriver().getOpts(); // If no architecture is bound, none of the translations here are relevant. if (!BoundArch) return DAL; // Add an explicit version min argument for the deployment target. We do this // after argument translation because -Xarch_ arguments may add a version min // argument. AddDeploymentTarget(*DAL); // For iOS 6, undo the translation to add -static for -mkernel/-fapple-kext. // FIXME: It would be far better to avoid inserting those -static arguments, // but we can't check the deployment target in the translation code until // it is set here. if (isTargetIOSBased() && !isIPhoneOSVersionLT(6, 0)) { for (ArgList::iterator it = DAL->begin(), ie = DAL->end(); it != ie; ) { Arg *A = *it; ++it; if (A->getOption().getID() != options::OPT_mkernel && A->getOption().getID() != options::OPT_fapple_kext) continue; assert(it != ie && "unexpected argument translation"); A = *it; assert(A->getOption().getID() == options::OPT_static && "missing expected -static argument"); it = DAL->getArgs().erase(it); } } // Default to use libc++ on OS X 10.9+ and iOS 7+. if (((isTargetMacOS() && !isMacosxVersionLT(10, 9)) || (isTargetIOSBased() && !isIPhoneOSVersionLT(7, 0))) && !Args.getLastArg(options::OPT_stdlib_EQ)) DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_stdlib_EQ), "libc++"); // Validate the C++ standard library choice. CXXStdlibType Type = GetCXXStdlibType(*DAL); if (Type == ToolChain::CST_Libcxx) { // Check whether the target provides libc++. StringRef where; // Complain about targeting iOS < 5.0 in any way. if (isTargetIOSBased() && isIPhoneOSVersionLT(5, 0)) where = "iOS 5.0"; if (where != StringRef()) { getDriver().Diag(clang::diag::err_drv_invalid_libcxx_deployment) << where; } } return DAL; } bool MachO::IsUnwindTablesDefault() const { return getArch() == llvm::Triple::x86_64; } bool MachO::UseDwarfDebugFlags() const { if (const char *S = ::getenv("RC_DEBUG_OPTIONS")) return S[0] != '\0'; return false; } bool Darwin::UseSjLjExceptions() const { // Darwin uses SjLj exceptions on ARM. return (getTriple().getArch() == llvm::Triple::arm || getTriple().getArch() == llvm::Triple::thumb); } bool MachO::isPICDefault() const { return true; } bool MachO::isPIEDefault() const { return false; } bool MachO::isPICDefaultForced() const { return (getArch() == llvm::Triple::x86_64 || getArch() == llvm::Triple::aarch64); } bool MachO::SupportsProfiling() const { // Profiling instrumentation is only supported on x86. return getArch() == llvm::Triple::x86 || getArch() == llvm::Triple::x86_64; } void Darwin::addMinVersionArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const { VersionTuple TargetVersion = getTargetVersion(); if (isTargetIOSSimulator()) CmdArgs.push_back("-ios_simulator_version_min"); else if (isTargetIOSBased()) CmdArgs.push_back("-iphoneos_version_min"); else { assert(isTargetMacOS() && "unexpected target"); CmdArgs.push_back("-macosx_version_min"); } CmdArgs.push_back(Args.MakeArgString(TargetVersion.getAsString())); } void Darwin::addStartObjectFileArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const { // Derived from startfile spec. if (Args.hasArg(options::OPT_dynamiclib)) { // Derived from darwin_dylib1 spec. if (isTargetIOSSimulator()) { ; // iOS simulator does not need dylib1.o. } else if (isTargetIPhoneOS()) { if (isIPhoneOSVersionLT(3, 1)) CmdArgs.push_back("-ldylib1.o"); } else { if (isMacosxVersionLT(10, 5)) CmdArgs.push_back("-ldylib1.o"); else if (isMacosxVersionLT(10, 6)) CmdArgs.push_back("-ldylib1.10.5.o"); } } else { if (Args.hasArg(options::OPT_bundle)) { if (!Args.hasArg(options::OPT_static)) { // Derived from darwin_bundle1 spec. if (isTargetIOSSimulator()) { ; // iOS simulator does not need bundle1.o. } else if (isTargetIPhoneOS()) { if (isIPhoneOSVersionLT(3, 1)) CmdArgs.push_back("-lbundle1.o"); } else { if (isMacosxVersionLT(10, 6)) CmdArgs.push_back("-lbundle1.o"); } } } else { if (Args.hasArg(options::OPT_pg) && SupportsProfiling()) { if (Args.hasArg(options::OPT_static) || Args.hasArg(options::OPT_object) || Args.hasArg(options::OPT_preload)) { CmdArgs.push_back("-lgcrt0.o"); } else { CmdArgs.push_back("-lgcrt1.o"); // darwin_crt2 spec is empty. } // By default on OS X 10.8 and later, we don't link with a crt1.o // file and the linker knows to use _main as the entry point. But, // when compiling with -pg, we need to link with the gcrt1.o file, // so pass the -no_new_main option to tell the linker to use the // "start" symbol as the entry point. if (isTargetMacOS() && !isMacosxVersionLT(10, 8)) CmdArgs.push_back("-no_new_main"); } else { if (Args.hasArg(options::OPT_static) || Args.hasArg(options::OPT_object) || Args.hasArg(options::OPT_preload)) { CmdArgs.push_back("-lcrt0.o"); } else { // Derived from darwin_crt1 spec. if (isTargetIOSSimulator()) { ; // iOS simulator does not need crt1.o. } else if (isTargetIPhoneOS()) { if (getArch() == llvm::Triple::aarch64) ; // iOS does not need any crt1 files for arm64 else if (isIPhoneOSVersionLT(3, 1)) CmdArgs.push_back("-lcrt1.o"); else if (isIPhoneOSVersionLT(6, 0)) CmdArgs.push_back("-lcrt1.3.1.o"); } else { if (isMacosxVersionLT(10, 5)) CmdArgs.push_back("-lcrt1.o"); else if (isMacosxVersionLT(10, 6)) CmdArgs.push_back("-lcrt1.10.5.o"); else if (isMacosxVersionLT(10, 8)) CmdArgs.push_back("-lcrt1.10.6.o"); // darwin_crt2 spec is empty. } } } } } if (!isTargetIPhoneOS() && Args.hasArg(options::OPT_shared_libgcc) && isMacosxVersionLT(10, 5)) { const char *Str = Args.MakeArgString(GetFilePath("crt3.o")); CmdArgs.push_back(Str); } } bool Darwin::SupportsObjCGC() const { return isTargetMacOS(); } void Darwin::CheckObjCARC() const { if (isTargetIOSBased()|| (isTargetMacOS() && !isMacosxVersionLT(10, 6))) return; getDriver().Diag(diag::err_arc_unsupported_on_toolchain); } /// Generic_GCC - A tool chain using the 'gcc' command to perform /// all subcommands; this relies on gcc translating the majority of /// command line options. /// \brief Parse a GCCVersion object out of a string of text. /// /// This is the primary means of forming GCCVersion objects. /*static*/ Generic_GCC::GCCVersion Linux::GCCVersion::Parse(StringRef VersionText) { const GCCVersion BadVersion = { VersionText.str(), -1, -1, -1, "", "", "" }; std::pair First = VersionText.split('.'); std::pair Second = First.second.split('.'); GCCVersion GoodVersion = { VersionText.str(), -1, -1, -1, "", "", "" }; if (First.first.getAsInteger(10, GoodVersion.Major) || GoodVersion.Major < 0) return BadVersion; GoodVersion.MajorStr = First.first.str(); if (Second.first.getAsInteger(10, GoodVersion.Minor) || GoodVersion.Minor < 0) return BadVersion; GoodVersion.MinorStr = Second.first.str(); // First look for a number prefix and parse that if present. Otherwise just // stash the entire patch string in the suffix, and leave the number // unspecified. This covers versions strings such as: // 4.4 // 4.4.0 // 4.4.x // 4.4.2-rc4 // 4.4.x-patched // And retains any patch number it finds. StringRef PatchText = GoodVersion.PatchSuffix = Second.second.str(); if (!PatchText.empty()) { if (size_t EndNumber = PatchText.find_first_not_of("0123456789")) { // Try to parse the number and any suffix. if (PatchText.slice(0, EndNumber).getAsInteger(10, GoodVersion.Patch) || GoodVersion.Patch < 0) return BadVersion; GoodVersion.PatchSuffix = PatchText.substr(EndNumber); } } return GoodVersion; } /// \brief Less-than for GCCVersion, implementing a Strict Weak Ordering. bool Generic_GCC::GCCVersion::isOlderThan(int RHSMajor, int RHSMinor, int RHSPatch, StringRef RHSPatchSuffix) const { if (Major != RHSMajor) return Major < RHSMajor; if (Minor != RHSMinor) return Minor < RHSMinor; if (Patch != RHSPatch) { // Note that versions without a specified patch sort higher than those with // a patch. if (RHSPatch == -1) return true; if (Patch == -1) return false; // Otherwise just sort on the patch itself. return Patch < RHSPatch; } if (PatchSuffix != RHSPatchSuffix) { // Sort empty suffixes higher. if (RHSPatchSuffix.empty()) return true; if (PatchSuffix.empty()) return false; // Provide a lexicographic sort to make this a total ordering. return PatchSuffix < RHSPatchSuffix; } // The versions are equal. return false; } static llvm::StringRef getGCCToolchainDir(const ArgList &Args) { const Arg *A = Args.getLastArg(options::OPT_gcc_toolchain); if (A) return A->getValue(); return GCC_INSTALL_PREFIX; } /// \brief Initialize a GCCInstallationDetector from the driver. /// /// This performs all of the autodetection and sets up the various paths. /// Once constructed, a GCCInstallationDetector is essentially immutable. /// /// FIXME: We shouldn't need an explicit TargetTriple parameter here, and /// should instead pull the target out of the driver. This is currently /// necessary because the driver doesn't store the final version of the target /// triple. void Generic_GCC::GCCInstallationDetector::init( const Driver &D, const llvm::Triple &TargetTriple, const ArgList &Args) { llvm::Triple BiarchVariantTriple = TargetTriple.isArch32Bit() ? TargetTriple.get64BitArchVariant() : TargetTriple.get32BitArchVariant(); // The library directories which may contain GCC installations. SmallVector CandidateLibDirs, CandidateBiarchLibDirs; // The compatible GCC triples for this particular architecture. SmallVector CandidateTripleAliases; SmallVector CandidateBiarchTripleAliases; CollectLibDirsAndTriples(TargetTriple, BiarchVariantTriple, CandidateLibDirs, CandidateTripleAliases, CandidateBiarchLibDirs, CandidateBiarchTripleAliases); // Compute the set of prefixes for our search. SmallVector Prefixes(D.PrefixDirs.begin(), D.PrefixDirs.end()); StringRef GCCToolchainDir = getGCCToolchainDir(Args); if (GCCToolchainDir != "") { if (GCCToolchainDir.back() == '/') GCCToolchainDir = GCCToolchainDir.drop_back(); // remove the / Prefixes.push_back(GCCToolchainDir); } else { // If we have a SysRoot, try that first. if (!D.SysRoot.empty()) { Prefixes.push_back(D.SysRoot); Prefixes.push_back(D.SysRoot + "/usr"); } // Then look for gcc installed alongside clang. Prefixes.push_back(D.InstalledDir + "/.."); // And finally in /usr. if (D.SysRoot.empty()) Prefixes.push_back("/usr"); } // Loop over the various components which exist and select the best GCC // installation available. GCC installs are ranked by version number. Version = GCCVersion::Parse("0.0.0"); for (unsigned i = 0, ie = Prefixes.size(); i < ie; ++i) { if (!llvm::sys::fs::exists(Prefixes[i])) continue; for (unsigned j = 0, je = CandidateLibDirs.size(); j < je; ++j) { const std::string LibDir = Prefixes[i] + CandidateLibDirs[j].str(); if (!llvm::sys::fs::exists(LibDir)) continue; for (unsigned k = 0, ke = CandidateTripleAliases.size(); k < ke; ++k) ScanLibDirForGCCTriple(TargetTriple, Args, LibDir, CandidateTripleAliases[k]); } for (unsigned j = 0, je = CandidateBiarchLibDirs.size(); j < je; ++j) { const std::string LibDir = Prefixes[i] + CandidateBiarchLibDirs[j].str(); if (!llvm::sys::fs::exists(LibDir)) continue; for (unsigned k = 0, ke = CandidateBiarchTripleAliases.size(); k < ke; ++k) ScanLibDirForGCCTriple(TargetTriple, Args, LibDir, CandidateBiarchTripleAliases[k], /*NeedsBiarchSuffix=*/ true); } } } void Generic_GCC::GCCInstallationDetector::print(raw_ostream &OS) const { for (const auto &InstallPath : CandidateGCCInstallPaths) OS << "Found candidate GCC installation: " << InstallPath << "\n"; if (!GCCInstallPath.empty()) OS << "Selected GCC installation: " << GCCInstallPath << "\n"; for (const auto &Multilib : Multilibs) OS << "Candidate multilib: " << Multilib << "\n"; if (Multilibs.size() != 0 || !SelectedMultilib.isDefault()) OS << "Selected multilib: " << SelectedMultilib << "\n"; } bool Generic_GCC::GCCInstallationDetector::getBiarchSibling(Multilib &M) const { if (BiarchSibling.hasValue()) { M = BiarchSibling.getValue(); return true; } return false; } /*static*/ void Generic_GCC::GCCInstallationDetector::CollectLibDirsAndTriples( const llvm::Triple &TargetTriple, const llvm::Triple &BiarchTriple, SmallVectorImpl &LibDirs, SmallVectorImpl &TripleAliases, SmallVectorImpl &BiarchLibDirs, SmallVectorImpl &BiarchTripleAliases) { // Declare a bunch of static data sets that we'll select between below. These // are specifically designed to always refer to string literals to avoid any // lifetime or initialization issues. static const char *const AArch64LibDirs[] = { "/lib64", "/lib" }; static const char *const AArch64Triples[] = { "aarch64-none-linux-gnu", "aarch64-linux-gnu", "aarch64-linux-android", "aarch64-redhat-linux" }; static const char *const AArch64beLibDirs[] = { "/lib" }; static const char *const AArch64beTriples[] = { "aarch64_be-none-linux-gnu", "aarch64_be-linux-gnu" }; static const char *const ARMLibDirs[] = { "/lib" }; static const char *const ARMTriples[] = { "arm-linux-gnueabi", "arm-linux-androideabi" }; static const char *const ARMHFTriples[] = { "arm-linux-gnueabihf", "armv7hl-redhat-linux-gnueabi" }; static const char *const ARMebLibDirs[] = { "/lib" }; static const char *const ARMebTriples[] = { "armeb-linux-gnueabi", "armeb-linux-androideabi" }; static const char *const ARMebHFTriples[] = { "armeb-linux-gnueabihf", "armebv7hl-redhat-linux-gnueabi" }; static const char *const X86_64LibDirs[] = { "/lib64", "/lib" }; static const char *const X86_64Triples[] = { "x86_64-linux-gnu", "x86_64-unknown-linux-gnu", "x86_64-pc-linux-gnu", "x86_64-redhat-linux6E", "x86_64-redhat-linux", "x86_64-suse-linux", "x86_64-manbo-linux-gnu", "x86_64-linux-gnu", "x86_64-slackware-linux", "x86_64-linux-android", "x86_64-unknown-linux" }; static const char *const X32LibDirs[] = { "/libx32" }; static const char *const X86LibDirs[] = { "/lib32", "/lib" }; static const char *const X86Triples[] = { "i686-linux-gnu", "i686-pc-linux-gnu", "i486-linux-gnu", "i386-linux-gnu", "i386-redhat-linux6E", "i686-redhat-linux", "i586-redhat-linux", "i386-redhat-linux", "i586-suse-linux", "i486-slackware-linux", "i686-montavista-linux", "i686-linux-android", "i586-linux-gnu" }; static const char *const MIPSLibDirs[] = { "/lib" }; static const char *const MIPSTriples[] = { "mips-linux-gnu", "mips-mti-linux-gnu", "mips-img-linux-gnu" }; static const char *const MIPSELLibDirs[] = { "/lib" }; static const char *const MIPSELTriples[] = { "mipsel-linux-gnu", "mipsel-linux-android", "mips-img-linux-gnu" }; static const char *const MIPS64LibDirs[] = { "/lib64", "/lib" }; static const char *const MIPS64Triples[] = { "mips64-linux-gnu", "mips-mti-linux-gnu", "mips-img-linux-gnu", "mips64-linux-gnuabi64" }; static const char *const MIPS64ELLibDirs[] = { "/lib64", "/lib" }; static const char *const MIPS64ELTriples[] = { "mips64el-linux-gnu", "mips-mti-linux-gnu", "mips-img-linux-gnu", "mips64el-linux-android", "mips64el-linux-gnuabi64" }; static const char *const PPCLibDirs[] = { "/lib32", "/lib" }; static const char *const PPCTriples[] = { "powerpc-linux-gnu", "powerpc-unknown-linux-gnu", "powerpc-linux-gnuspe", "powerpc-suse-linux", "powerpc-montavista-linuxspe" }; static const char *const PPC64LibDirs[] = { "/lib64", "/lib" }; static const char *const PPC64Triples[] = { "powerpc64-linux-gnu", "powerpc64-unknown-linux-gnu", "powerpc64-suse-linux", "ppc64-redhat-linux" }; static const char *const PPC64LELibDirs[] = { "/lib64", "/lib" }; static const char *const PPC64LETriples[] = { "powerpc64le-linux-gnu", "powerpc64le-unknown-linux-gnu", "powerpc64le-suse-linux", "ppc64le-redhat-linux" }; static const char *const SPARCv8LibDirs[] = { "/lib32", "/lib" }; static const char *const SPARCv8Triples[] = { "sparc-linux-gnu", "sparcv8-linux-gnu" }; static const char *const SPARCv9LibDirs[] = { "/lib64", "/lib" }; static const char *const SPARCv9Triples[] = { "sparc64-linux-gnu", "sparcv9-linux-gnu" }; static const char *const SystemZLibDirs[] = { "/lib64", "/lib" }; static const char *const SystemZTriples[] = { "s390x-linux-gnu", "s390x-unknown-linux-gnu", "s390x-ibm-linux-gnu", "s390x-suse-linux", "s390x-redhat-linux" }; using std::begin; using std::end; switch (TargetTriple.getArch()) { case llvm::Triple::aarch64: LibDirs.append(begin(AArch64LibDirs), end(AArch64LibDirs)); TripleAliases.append(begin(AArch64Triples), end(AArch64Triples)); BiarchLibDirs.append(begin(AArch64LibDirs), end(AArch64LibDirs)); BiarchTripleAliases.append(begin(AArch64Triples), end(AArch64Triples)); break; case llvm::Triple::aarch64_be: LibDirs.append(begin(AArch64beLibDirs), end(AArch64beLibDirs)); TripleAliases.append(begin(AArch64beTriples), end(AArch64beTriples)); BiarchLibDirs.append(begin(AArch64beLibDirs), end(AArch64beLibDirs)); BiarchTripleAliases.append(begin(AArch64beTriples), end(AArch64beTriples)); break; case llvm::Triple::arm: case llvm::Triple::thumb: LibDirs.append(begin(ARMLibDirs), end(ARMLibDirs)); if (TargetTriple.getEnvironment() == llvm::Triple::GNUEABIHF) { TripleAliases.append(begin(ARMHFTriples), end(ARMHFTriples)); } else { TripleAliases.append(begin(ARMTriples), end(ARMTriples)); } break; case llvm::Triple::armeb: case llvm::Triple::thumbeb: LibDirs.append(begin(ARMebLibDirs), end(ARMebLibDirs)); if (TargetTriple.getEnvironment() == llvm::Triple::GNUEABIHF) { TripleAliases.append(begin(ARMebHFTriples), end(ARMebHFTriples)); } else { TripleAliases.append(begin(ARMebTriples), end(ARMebTriples)); } break; case llvm::Triple::x86_64: LibDirs.append(begin(X86_64LibDirs), end(X86_64LibDirs)); TripleAliases.append(begin(X86_64Triples), end(X86_64Triples)); // x32 is always available when x86_64 is available, so adding it as // secondary arch with x86_64 triples if (TargetTriple.getEnvironment() == llvm::Triple::GNUX32) { BiarchLibDirs.append(begin(X32LibDirs), end(X32LibDirs)); BiarchTripleAliases.append(begin(X86_64Triples), end(X86_64Triples)); } else { BiarchLibDirs.append(begin(X86LibDirs), end(X86LibDirs)); BiarchTripleAliases.append(begin(X86Triples), end(X86Triples)); } break; case llvm::Triple::x86: LibDirs.append(begin(X86LibDirs), end(X86LibDirs)); TripleAliases.append(begin(X86Triples), end(X86Triples)); BiarchLibDirs.append(begin(X86_64LibDirs), end(X86_64LibDirs)); BiarchTripleAliases.append(begin(X86_64Triples), end(X86_64Triples)); break; case llvm::Triple::mips: LibDirs.append(begin(MIPSLibDirs), end(MIPSLibDirs)); TripleAliases.append(begin(MIPSTriples), end(MIPSTriples)); BiarchLibDirs.append(begin(MIPS64LibDirs), end(MIPS64LibDirs)); BiarchTripleAliases.append(begin(MIPS64Triples), end(MIPS64Triples)); break; case llvm::Triple::mipsel: LibDirs.append(begin(MIPSELLibDirs), end(MIPSELLibDirs)); TripleAliases.append(begin(MIPSELTriples), end(MIPSELTriples)); TripleAliases.append(begin(MIPSTriples), end(MIPSTriples)); BiarchLibDirs.append(begin(MIPS64ELLibDirs), end(MIPS64ELLibDirs)); BiarchTripleAliases.append(begin(MIPS64ELTriples), end(MIPS64ELTriples)); break; case llvm::Triple::mips64: LibDirs.append(begin(MIPS64LibDirs), end(MIPS64LibDirs)); TripleAliases.append(begin(MIPS64Triples), end(MIPS64Triples)); BiarchLibDirs.append(begin(MIPSLibDirs), end(MIPSLibDirs)); BiarchTripleAliases.append(begin(MIPSTriples), end(MIPSTriples)); break; case llvm::Triple::mips64el: LibDirs.append(begin(MIPS64ELLibDirs), end(MIPS64ELLibDirs)); TripleAliases.append(begin(MIPS64ELTriples), end(MIPS64ELTriples)); BiarchLibDirs.append(begin(MIPSELLibDirs), end(MIPSELLibDirs)); BiarchTripleAliases.append(begin(MIPSELTriples), end(MIPSELTriples)); BiarchTripleAliases.append(begin(MIPSTriples), end(MIPSTriples)); break; case llvm::Triple::ppc: LibDirs.append(begin(PPCLibDirs), end(PPCLibDirs)); TripleAliases.append(begin(PPCTriples), end(PPCTriples)); BiarchLibDirs.append(begin(PPC64LibDirs), end(PPC64LibDirs)); BiarchTripleAliases.append(begin(PPC64Triples), end(PPC64Triples)); break; case llvm::Triple::ppc64: LibDirs.append(begin(PPC64LibDirs), end(PPC64LibDirs)); TripleAliases.append(begin(PPC64Triples), end(PPC64Triples)); BiarchLibDirs.append(begin(PPCLibDirs), end(PPCLibDirs)); BiarchTripleAliases.append(begin(PPCTriples), end(PPCTriples)); break; case llvm::Triple::ppc64le: LibDirs.append(begin(PPC64LELibDirs), end(PPC64LELibDirs)); TripleAliases.append(begin(PPC64LETriples), end(PPC64LETriples)); break; case llvm::Triple::sparc: LibDirs.append(begin(SPARCv8LibDirs), end(SPARCv8LibDirs)); TripleAliases.append(begin(SPARCv8Triples), end(SPARCv8Triples)); BiarchLibDirs.append(begin(SPARCv9LibDirs), end(SPARCv9LibDirs)); BiarchTripleAliases.append(begin(SPARCv9Triples), end(SPARCv9Triples)); break; case llvm::Triple::sparcv9: LibDirs.append(begin(SPARCv9LibDirs), end(SPARCv9LibDirs)); TripleAliases.append(begin(SPARCv9Triples), end(SPARCv9Triples)); BiarchLibDirs.append(begin(SPARCv8LibDirs), end(SPARCv8LibDirs)); BiarchTripleAliases.append(begin(SPARCv8Triples), end(SPARCv8Triples)); break; case llvm::Triple::systemz: LibDirs.append(begin(SystemZLibDirs), end(SystemZLibDirs)); TripleAliases.append(begin(SystemZTriples), end(SystemZTriples)); break; default: // By default, just rely on the standard lib directories and the original // triple. break; } // Always append the drivers target triple to the end, in case it doesn't // match any of our aliases. TripleAliases.push_back(TargetTriple.str()); // Also include the multiarch variant if it's different. if (TargetTriple.str() != BiarchTriple.str()) BiarchTripleAliases.push_back(BiarchTriple.str()); } namespace { // Filter to remove Multilibs that don't exist as a suffix to Path class FilterNonExistent : public MultilibSet::FilterCallback { std::string Base; public: FilterNonExistent(std::string Base) : Base(Base) {} bool operator()(const Multilib &M) const override { return !llvm::sys::fs::exists(Base + M.gccSuffix() + "/crtbegin.o"); } }; } // end anonymous namespace static void addMultilibFlag(bool Enabled, const char *const Flag, std::vector &Flags) { if (Enabled) Flags.push_back(std::string("+") + Flag); else Flags.push_back(std::string("-") + Flag); } static bool isMipsArch(llvm::Triple::ArchType Arch) { return Arch == llvm::Triple::mips || Arch == llvm::Triple::mipsel || Arch == llvm::Triple::mips64 || Arch == llvm::Triple::mips64el; } static bool isMips32(llvm::Triple::ArchType Arch) { return Arch == llvm::Triple::mips || Arch == llvm::Triple::mipsel; } static bool isMips64(llvm::Triple::ArchType Arch) { return Arch == llvm::Triple::mips64 || Arch == llvm::Triple::mips64el; } static bool isMipsEL(llvm::Triple::ArchType Arch) { return Arch == llvm::Triple::mipsel || Arch == llvm::Triple::mips64el; } static bool isMips16(const ArgList &Args) { Arg *A = Args.getLastArg(options::OPT_mips16, options::OPT_mno_mips16); return A && A->getOption().matches(options::OPT_mips16); } static bool isMicroMips(const ArgList &Args) { Arg *A = Args.getLastArg(options::OPT_mmicromips, options::OPT_mno_micromips); return A && A->getOption().matches(options::OPT_mmicromips); } struct DetectedMultilibs { /// The set of multilibs that the detected installation supports. MultilibSet Multilibs; /// The primary multilib appropriate for the given flags. Multilib SelectedMultilib; /// On Biarch systems, this corresponds to the default multilib when /// targeting the non-default multilib. Otherwise, it is empty. llvm::Optional BiarchSibling; }; static Multilib makeMultilib(StringRef commonSuffix) { return Multilib(commonSuffix, commonSuffix, commonSuffix); } static bool findMIPSMultilibs(const llvm::Triple &TargetTriple, StringRef Path, const llvm::opt::ArgList &Args, DetectedMultilibs &Result) { // Some MIPS toolchains put libraries and object files compiled // using different options in to the sub-directoris which names // reflects the flags used for compilation. For example sysroot // directory might looks like the following examples: // // /usr // /lib <= crt*.o files compiled with '-mips32' // /mips16 // /usr // /lib <= crt*.o files compiled with '-mips16' // /el // /usr // /lib <= crt*.o files compiled with '-mips16 -EL' // // or // // /usr // /lib <= crt*.o files compiled with '-mips32r2' // /mips16 // /usr // /lib <= crt*.o files compiled with '-mips32r2 -mips16' // /mips32 // /usr // /lib <= crt*.o files compiled with '-mips32' FilterNonExistent NonExistent(Path); // Check for FSF toolchain multilibs MultilibSet FSFMipsMultilibs; { auto MArchMips32 = makeMultilib("/mips32") .flag("+m32").flag("-m64").flag("-mmicromips").flag("+march=mips32"); auto MArchMicroMips = makeMultilib("/micromips") .flag("+m32").flag("-m64").flag("+mmicromips"); auto MArchMips64r2 = makeMultilib("/mips64r2") .flag("-m32").flag("+m64").flag("+march=mips64r2"); auto MArchMips64 = makeMultilib("/mips64") .flag("-m32").flag("+m64").flag("-march=mips64r2"); auto MArchDefault = makeMultilib("") .flag("+m32").flag("-m64").flag("-mmicromips").flag("+march=mips32r2"); auto Mips16 = makeMultilib("/mips16") .flag("+mips16"); auto UCLibc = makeMultilib("/uclibc") .flag("+muclibc"); auto MAbi64 = makeMultilib("/64") .flag("+mabi=n64").flag("-mabi=n32").flag("-m32"); auto BigEndian = makeMultilib("") .flag("+EB").flag("-EL"); auto LittleEndian = makeMultilib("/el") .flag("+EL").flag("-EB"); auto SoftFloat = makeMultilib("/sof") .flag("+msoft-float"); auto Nan2008 = makeMultilib("/nan2008") .flag("+mnan=2008"); FSFMipsMultilibs = MultilibSet() .Either(MArchMips32, MArchMicroMips, MArchMips64r2, MArchMips64, MArchDefault) .Maybe(UCLibc) .Maybe(Mips16) .FilterOut("/mips64/mips16") .FilterOut("/mips64r2/mips16") .FilterOut("/micromips/mips16") .Maybe(MAbi64) .FilterOut("/micromips/64") .FilterOut("/mips32/64") .FilterOut("^/64") .FilterOut("/mips16/64") .Either(BigEndian, LittleEndian) .Maybe(SoftFloat) .Maybe(Nan2008) .FilterOut(".*sof/nan2008") .FilterOut(NonExistent) .setIncludeDirsCallback([]( StringRef InstallDir, StringRef TripleStr, const Multilib &M) { std::vector Dirs; Dirs.push_back((InstallDir + "/include").str()); std::string SysRootInc = InstallDir.str() + "/../../../../sysroot"; if (StringRef(M.includeSuffix()).startswith("/uclibc")) Dirs.push_back(SysRootInc + "/uclibc/usr/include"); else Dirs.push_back(SysRootInc + "/usr/include"); return Dirs; }); } // Check for Code Sourcery toolchain multilibs MultilibSet CSMipsMultilibs; { auto MArchMips16 = makeMultilib("/mips16") .flag("+m32").flag("+mips16"); auto MArchMicroMips = makeMultilib("/micromips") .flag("+m32").flag("+mmicromips"); auto MArchDefault = makeMultilib("") .flag("-mips16").flag("-mmicromips"); auto UCLibc = makeMultilib("/uclibc") .flag("+muclibc"); auto SoftFloat = makeMultilib("/soft-float") .flag("+msoft-float"); auto Nan2008 = makeMultilib("/nan2008") .flag("+mnan=2008"); auto DefaultFloat = makeMultilib("") .flag("-msoft-float").flag("-mnan=2008"); auto BigEndian = makeMultilib("") .flag("+EB").flag("-EL"); auto LittleEndian = makeMultilib("/el") .flag("+EL").flag("-EB"); // Note that this one's osSuffix is "" auto MAbi64 = makeMultilib("") .gccSuffix("/64") .includeSuffix("/64") .flag("+mabi=n64").flag("-mabi=n32").flag("-m32"); CSMipsMultilibs = MultilibSet() .Either(MArchMips16, MArchMicroMips, MArchDefault) .Maybe(UCLibc) .Either(SoftFloat, Nan2008, DefaultFloat) .FilterOut("/micromips/nan2008") .FilterOut("/mips16/nan2008") .Either(BigEndian, LittleEndian) .Maybe(MAbi64) .FilterOut("/mips16.*/64") .FilterOut("/micromips.*/64") .FilterOut(NonExistent) .setIncludeDirsCallback([]( StringRef InstallDir, StringRef TripleStr, const Multilib &M) { std::vector Dirs; Dirs.push_back((InstallDir + "/include").str()); std::string SysRootInc = InstallDir.str() + "/../../../../" + TripleStr.str(); if (StringRef(M.includeSuffix()).startswith("/uclibc")) Dirs.push_back(SysRootInc + "/libc/uclibc/usr/include"); else Dirs.push_back(SysRootInc + "/libc/usr/include"); return Dirs; }); } MultilibSet AndroidMipsMultilibs = MultilibSet() .Maybe(Multilib("/mips-r2").flag("+march=mips32r2")) .FilterOut(NonExistent); MultilibSet DebianMipsMultilibs; { Multilib MAbiN32 = Multilib() .gccSuffix("/n32") .includeSuffix("/n32") .flag("+mabi=n32"); Multilib M64 = Multilib() .gccSuffix("/64") .includeSuffix("/64") .flag("+m64").flag("-m32").flag("-mabi=n32"); Multilib M32 = Multilib() .flag("-m64").flag("+m32").flag("-mabi=n32"); DebianMipsMultilibs = MultilibSet() .Either(M32, M64, MAbiN32) .FilterOut(NonExistent); } MultilibSet ImgMultilibs; { auto Mips64r6 = makeMultilib("/mips64r6") .flag("+m64").flag("-m32"); auto LittleEndian = makeMultilib("/el") .flag("+EL").flag("-EB"); auto MAbi64 = makeMultilib("/64") .flag("+mabi=n64").flag("-mabi=n32").flag("-m32"); ImgMultilibs = MultilibSet() .Maybe(Mips64r6) .Maybe(MAbi64) .Maybe(LittleEndian) .FilterOut(NonExistent) .setIncludeDirsCallback([]( StringRef InstallDir, StringRef TripleStr, const Multilib &M) { std::vector Dirs; Dirs.push_back((InstallDir + "/include").str()); Dirs.push_back((InstallDir + "/../../../../sysroot/usr/include").str()); return Dirs; }); } StringRef CPUName; StringRef ABIName; tools::mips::getMipsCPUAndABI(Args, TargetTriple, CPUName, ABIName); llvm::Triple::ArchType TargetArch = TargetTriple.getArch(); Multilib::flags_list Flags; addMultilibFlag(isMips32(TargetArch), "m32", Flags); addMultilibFlag(isMips64(TargetArch), "m64", Flags); addMultilibFlag(isMips16(Args), "mips16", Flags); addMultilibFlag(CPUName == "mips32", "march=mips32", Flags); addMultilibFlag(CPUName == "mips32r2", "march=mips32r2", Flags); addMultilibFlag(CPUName == "mips64", "march=mips64", Flags); addMultilibFlag(CPUName == "mips64r2" || CPUName == "octeon", "march=mips64r2", Flags); addMultilibFlag(isMicroMips(Args), "mmicromips", Flags); addMultilibFlag(tools::mips::isUCLibc(Args), "muclibc", Flags); addMultilibFlag(tools::mips::isNaN2008(Args, TargetTriple), "mnan=2008", Flags); addMultilibFlag(ABIName == "n32", "mabi=n32", Flags); addMultilibFlag(ABIName == "n64", "mabi=n64", Flags); addMultilibFlag(isSoftFloatABI(Args), "msoft-float", Flags); addMultilibFlag(!isSoftFloatABI(Args), "mhard-float", Flags); addMultilibFlag(isMipsEL(TargetArch), "EL", Flags); addMultilibFlag(!isMipsEL(TargetArch), "EB", Flags); if (TargetTriple.getEnvironment() == llvm::Triple::Android) { // Select Android toolchain. It's the only choice in that case. if (AndroidMipsMultilibs.select(Flags, Result.SelectedMultilib)) { Result.Multilibs = AndroidMipsMultilibs; return true; } return false; } if (TargetTriple.getVendor() == llvm::Triple::ImaginationTechnologies && TargetTriple.getOS() == llvm::Triple::Linux && TargetTriple.getEnvironment() == llvm::Triple::GNU) { // Select mips-img-linux-gnu toolchain. if (ImgMultilibs.select(Flags, Result.SelectedMultilib)) { Result.Multilibs = ImgMultilibs; return true; } return false; } // Sort candidates. Toolchain that best meets the directories goes first. // Then select the first toolchains matches command line flags. MultilibSet *candidates[] = { &DebianMipsMultilibs, &FSFMipsMultilibs, &CSMipsMultilibs }; std::sort( std::begin(candidates), std::end(candidates), [](MultilibSet *a, MultilibSet *b) { return a->size() > b->size(); }); for (const auto &candidate : candidates) { if (candidate->select(Flags, Result.SelectedMultilib)) { if (candidate == &DebianMipsMultilibs) Result.BiarchSibling = Multilib(); Result.Multilibs = *candidate; return true; } } { // Fallback to the regular toolchain-tree structure. Multilib Default; Result.Multilibs.push_back(Default); Result.Multilibs.FilterOut(NonExistent); if (Result.Multilibs.select(Flags, Result.SelectedMultilib)) { Result.BiarchSibling = Multilib(); return true; } } return false; } static bool findBiarchMultilibs(const llvm::Triple &TargetTriple, StringRef Path, const ArgList &Args, bool NeedsBiarchSuffix, DetectedMultilibs &Result) { // Some versions of SUSE and Fedora on ppc64 put 32-bit libs // in what would normally be GCCInstallPath and put the 64-bit // libs in a subdirectory named 64. The simple logic we follow is that // *if* there is a subdirectory of the right name with crtbegin.o in it, // we use that. If not, and if not a biarch triple alias, we look for // crtbegin.o without the subdirectory. Multilib Default; Multilib Alt64 = Multilib() .gccSuffix("/64") .includeSuffix("/64") .flag("-m32").flag("+m64").flag("-mx32"); Multilib Alt32 = Multilib() .gccSuffix("/32") .includeSuffix("/32") .flag("+m32").flag("-m64").flag("-mx32"); Multilib Altx32 = Multilib() .gccSuffix("/x32") .includeSuffix("/x32") .flag("-m32").flag("-m64").flag("+mx32"); FilterNonExistent NonExistent(Path); // Determine default multilib from: 32, 64, x32 // Also handle cases such as 64 on 32, 32 on 64, etc. enum { UNKNOWN, WANT32, WANT64, WANTX32 } Want = UNKNOWN; const bool IsX32 = TargetTriple.getEnvironment() == llvm::Triple::GNUX32; if (TargetTriple.isArch32Bit() && !NonExistent(Alt32)) Want = WANT64; else if (TargetTriple.isArch64Bit() && IsX32 && !NonExistent(Altx32)) Want = WANT64; else if (TargetTriple.isArch64Bit() && !IsX32 && !NonExistent(Alt64)) Want = WANT32; else { if (TargetTriple.isArch32Bit()) Want = NeedsBiarchSuffix ? WANT64 : WANT32; else if (IsX32) Want = NeedsBiarchSuffix ? WANT64 : WANTX32; else Want = NeedsBiarchSuffix ? WANT32 : WANT64; } if (Want == WANT32) Default.flag("+m32").flag("-m64").flag("-mx32"); else if (Want == WANT64) Default.flag("-m32").flag("+m64").flag("-mx32"); else if (Want == WANTX32) Default.flag("-m32").flag("-m64").flag("+mx32"); else return false; Result.Multilibs.push_back(Default); Result.Multilibs.push_back(Alt64); Result.Multilibs.push_back(Alt32); Result.Multilibs.push_back(Altx32); Result.Multilibs.FilterOut(NonExistent); Multilib::flags_list Flags; addMultilibFlag(TargetTriple.isArch64Bit() && !IsX32, "m64", Flags); addMultilibFlag(TargetTriple.isArch32Bit(), "m32", Flags); addMultilibFlag(TargetTriple.isArch64Bit() && IsX32, "mx32", Flags); if (!Result.Multilibs.select(Flags, Result.SelectedMultilib)) return false; if (Result.SelectedMultilib == Alt64 || Result.SelectedMultilib == Alt32 || Result.SelectedMultilib == Altx32) Result.BiarchSibling = Default; return true; } void Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple( const llvm::Triple &TargetTriple, const ArgList &Args, const std::string &LibDir, StringRef CandidateTriple, bool NeedsBiarchSuffix) { llvm::Triple::ArchType TargetArch = TargetTriple.getArch(); // There are various different suffixes involving the triple we // check for. We also record what is necessary to walk from each back // up to the lib directory. const std::string LibSuffixes[] = { "/gcc/" + CandidateTriple.str(), // Debian puts cross-compilers in gcc-cross "/gcc-cross/" + CandidateTriple.str(), "/" + CandidateTriple.str() + "/gcc/" + CandidateTriple.str(), // The Freescale PPC SDK has the gcc libraries in // /usr/lib//x.y.z so have a look there as well. "/" + CandidateTriple.str(), // Ubuntu has a strange mis-matched pair of triples that this happens to // match. // FIXME: It may be worthwhile to generalize this and look for a second // triple. "/i386-linux-gnu/gcc/" + CandidateTriple.str() }; const std::string InstallSuffixes[] = { "/../../..", // gcc/ "/../../..", // gcc-cross/ "/../../../..", // /gcc/ "/../..", // / "/../../../.." // i386-linux-gnu/gcc// }; // Only look at the final, weird Ubuntu suffix for i386-linux-gnu. const unsigned NumLibSuffixes = (llvm::array_lengthof(LibSuffixes) - (TargetArch != llvm::Triple::x86)); for (unsigned i = 0; i < NumLibSuffixes; ++i) { StringRef LibSuffix = LibSuffixes[i]; std::error_code EC; for (llvm::sys::fs::directory_iterator LI(LibDir + LibSuffix, EC), LE; !EC && LI != LE; LI = LI.increment(EC)) { StringRef VersionText = llvm::sys::path::filename(LI->path()); GCCVersion CandidateVersion = GCCVersion::Parse(VersionText); if (CandidateVersion.Major != -1) // Filter obviously bad entries. if (!CandidateGCCInstallPaths.insert(LI->path()).second) continue; // Saw this path before; no need to look at it again. if (CandidateVersion.isOlderThan(4, 1, 1)) continue; if (CandidateVersion <= Version) continue; DetectedMultilibs Detected; // Debian mips multilibs behave more like the rest of the biarch ones, // so handle them there if (isMipsArch(TargetArch)) { if (!findMIPSMultilibs(TargetTriple, LI->path(), Args, Detected)) continue; } else if (!findBiarchMultilibs(TargetTriple, LI->path(), Args, NeedsBiarchSuffix, Detected)) { continue; } Multilibs = Detected.Multilibs; SelectedMultilib = Detected.SelectedMultilib; BiarchSibling = Detected.BiarchSibling; Version = CandidateVersion; GCCTriple.setTriple(CandidateTriple); // FIXME: We hack together the directory name here instead of // using LI to ensure stable path separators across Windows and // Linux. GCCInstallPath = LibDir + LibSuffixes[i] + "/" + VersionText.str(); GCCParentLibPath = GCCInstallPath + InstallSuffixes[i]; IsValid = true; } } } Generic_GCC::Generic_GCC(const Driver &D, const llvm::Triple& Triple, const ArgList &Args) : ToolChain(D, Triple, Args), GCCInstallation() { getProgramPaths().push_back(getDriver().getInstalledDir()); if (getDriver().getInstalledDir() != getDriver().Dir) getProgramPaths().push_back(getDriver().Dir); } Generic_GCC::~Generic_GCC() { } Tool *Generic_GCC::getTool(Action::ActionClass AC) const { switch (AC) { case Action::PreprocessJobClass: if (!Preprocess) Preprocess.reset(new tools::gcc::Preprocess(*this)); return Preprocess.get(); case Action::CompileJobClass: if (!Compile) Compile.reset(new tools::gcc::Compile(*this)); return Compile.get(); default: return ToolChain::getTool(AC); } } Tool *Generic_GCC::buildAssembler() const { return new tools::gnutools::Assemble(*this); } Tool *Generic_GCC::buildLinker() const { return new tools::gcc::Link(*this); } void Generic_GCC::printVerboseInfo(raw_ostream &OS) const { // Print the information about how we detected the GCC installation. GCCInstallation.print(OS); } bool Generic_GCC::IsUnwindTablesDefault() const { return getArch() == llvm::Triple::x86_64; } bool Generic_GCC::isPICDefault() const { return false; } bool Generic_GCC::isPIEDefault() const { return false; } bool Generic_GCC::isPICDefaultForced() const { return false; } bool Generic_GCC::IsIntegratedAssemblerDefault() const { return getTriple().getArch() == llvm::Triple::x86 || getTriple().getArch() == llvm::Triple::x86_64 || getTriple().getArch() == llvm::Triple::aarch64 || getTriple().getArch() == llvm::Triple::aarch64_be || getTriple().getArch() == llvm::Triple::arm || getTriple().getArch() == llvm::Triple::armeb || getTriple().getArch() == llvm::Triple::thumb || getTriple().getArch() == llvm::Triple::thumbeb || getTriple().getArch() == llvm::Triple::ppc || getTriple().getArch() == llvm::Triple::ppc64 || getTriple().getArch() == llvm::Triple::ppc64le || getTriple().getArch() == llvm::Triple::sparc || getTriple().getArch() == llvm::Triple::sparcv9 || getTriple().getArch() == llvm::Triple::systemz; } void Generic_ELF::addClangTargetOptions(const ArgList &DriverArgs, ArgStringList &CC1Args) const { const Generic_GCC::GCCVersion &V = GCCInstallation.getVersion(); bool UseInitArrayDefault = getTriple().getArch() == llvm::Triple::aarch64 || getTriple().getArch() == llvm::Triple::aarch64_be || (getTriple().getOS() == llvm::Triple::Linux && (!V.isOlderThan(4, 7, 0) || getTriple().getEnvironment() == llvm::Triple::Android)); if (DriverArgs.hasFlag(options::OPT_fuse_init_array, options::OPT_fno_use_init_array, UseInitArrayDefault)) CC1Args.push_back("-fuse-init-array"); } /// Hexagon Toolchain std::string Hexagon_TC::GetGnuDir(const std::string &InstalledDir, const ArgList &Args) { // Locate the rest of the toolchain ... std::string GccToolchain = getGCCToolchainDir(Args); if (!GccToolchain.empty()) return GccToolchain; std::string InstallRelDir = InstalledDir + "/../../gnu"; if (llvm::sys::fs::exists(InstallRelDir)) return InstallRelDir; std::string PrefixRelDir = std::string(LLVM_PREFIX) + "/../gnu"; if (llvm::sys::fs::exists(PrefixRelDir)) return PrefixRelDir; return InstallRelDir; } static void GetHexagonLibraryPaths( const ArgList &Args, const std::string &Ver, const std::string &MarchString, const std::string &InstalledDir, ToolChain::path_list *LibPaths) { bool buildingLib = Args.hasArg(options::OPT_shared); //---------------------------------------------------------------------------- // -L Args //---------------------------------------------------------------------------- for (arg_iterator it = Args.filtered_begin(options::OPT_L), ie = Args.filtered_end(); it != ie; ++it) { for (unsigned i = 0, e = (*it)->getNumValues(); i != e; ++i) LibPaths->push_back((*it)->getValue(i)); } //---------------------------------------------------------------------------- // Other standard paths //---------------------------------------------------------------------------- const std::string MarchSuffix = "/" + MarchString; const std::string G0Suffix = "/G0"; const std::string MarchG0Suffix = MarchSuffix + G0Suffix; const std::string RootDir = Hexagon_TC::GetGnuDir(InstalledDir, Args) + "/"; // lib/gcc/hexagon/... std::string LibGCCHexagonDir = RootDir + "lib/gcc/hexagon/"; if (buildingLib) { LibPaths->push_back(LibGCCHexagonDir + Ver + MarchG0Suffix); LibPaths->push_back(LibGCCHexagonDir + Ver + G0Suffix); } LibPaths->push_back(LibGCCHexagonDir + Ver + MarchSuffix); LibPaths->push_back(LibGCCHexagonDir + Ver); // lib/gcc/... LibPaths->push_back(RootDir + "lib/gcc"); // hexagon/lib/... std::string HexagonLibDir = RootDir + "hexagon/lib"; if (buildingLib) { LibPaths->push_back(HexagonLibDir + MarchG0Suffix); LibPaths->push_back(HexagonLibDir + G0Suffix); } LibPaths->push_back(HexagonLibDir + MarchSuffix); LibPaths->push_back(HexagonLibDir); } Hexagon_TC::Hexagon_TC(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) : Linux(D, Triple, Args) { const std::string InstalledDir(getDriver().getInstalledDir()); const std::string GnuDir = Hexagon_TC::GetGnuDir(InstalledDir, Args); // Note: Generic_GCC::Generic_GCC adds InstalledDir and getDriver().Dir to // program paths const std::string BinDir(GnuDir + "/bin"); if (llvm::sys::fs::exists(BinDir)) getProgramPaths().push_back(BinDir); // Determine version of GCC libraries and headers to use. const std::string HexagonDir(GnuDir + "/lib/gcc/hexagon"); std::error_code ec; GCCVersion MaxVersion= GCCVersion::Parse("0.0.0"); for (llvm::sys::fs::directory_iterator di(HexagonDir, ec), de; !ec && di != de; di = di.increment(ec)) { GCCVersion cv = GCCVersion::Parse(llvm::sys::path::filename(di->path())); if (MaxVersion < cv) MaxVersion = cv; } GCCLibAndIncVersion = MaxVersion; ToolChain::path_list *LibPaths= &getFilePaths(); // Remove paths added by Linux toolchain. Currently Hexagon_TC really targets // 'elf' OS type, so the Linux paths are not appropriate. When we actually // support 'linux' we'll need to fix this up LibPaths->clear(); GetHexagonLibraryPaths( Args, GetGCCLibAndIncVersion(), GetTargetCPU(Args), InstalledDir, LibPaths); } Hexagon_TC::~Hexagon_TC() { } Tool *Hexagon_TC::buildAssembler() const { return new tools::hexagon::Assemble(*this); } Tool *Hexagon_TC::buildLinker() const { return new tools::hexagon::Link(*this); } void Hexagon_TC::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { const Driver &D = getDriver(); if (DriverArgs.hasArg(options::OPT_nostdinc) || DriverArgs.hasArg(options::OPT_nostdlibinc)) return; std::string Ver(GetGCCLibAndIncVersion()); std::string GnuDir = Hexagon_TC::GetGnuDir(D.InstalledDir, DriverArgs); std::string HexagonDir(GnuDir + "/lib/gcc/hexagon/" + Ver); addExternCSystemInclude(DriverArgs, CC1Args, HexagonDir + "/include"); addExternCSystemInclude(DriverArgs, CC1Args, HexagonDir + "/include-fixed"); addExternCSystemInclude(DriverArgs, CC1Args, GnuDir + "/hexagon/include"); } void Hexagon_TC::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { if (DriverArgs.hasArg(options::OPT_nostdlibinc) || DriverArgs.hasArg(options::OPT_nostdincxx)) return; const Driver &D = getDriver(); std::string Ver(GetGCCLibAndIncVersion()); SmallString<128> IncludeDir( Hexagon_TC::GetGnuDir(D.InstalledDir, DriverArgs)); llvm::sys::path::append(IncludeDir, "hexagon/include/c++/"); llvm::sys::path::append(IncludeDir, Ver); addSystemInclude(DriverArgs, CC1Args, IncludeDir.str()); } ToolChain::CXXStdlibType Hexagon_TC::GetCXXStdlibType(const ArgList &Args) const { Arg *A = Args.getLastArg(options::OPT_stdlib_EQ); if (!A) return ToolChain::CST_Libstdcxx; StringRef Value = A->getValue(); if (Value != "libstdc++") { getDriver().Diag(diag::err_drv_invalid_stdlib_name) << A->getAsString(Args); } return ToolChain::CST_Libstdcxx; } static int getHexagonVersion(const ArgList &Args) { Arg *A = Args.getLastArg(options::OPT_march_EQ, options::OPT_mcpu_EQ); // Select the default CPU (v4) if none was given. if (!A) return 4; // FIXME: produce errors if we cannot parse the version. StringRef WhichHexagon = A->getValue(); if (WhichHexagon.startswith("hexagonv")) { int Val; if (!WhichHexagon.substr(sizeof("hexagonv") - 1).getAsInteger(10, Val)) return Val; } if (WhichHexagon.startswith("v")) { int Val; if (!WhichHexagon.substr(1).getAsInteger(10, Val)) return Val; } // FIXME: should probably be an error. return 4; } StringRef Hexagon_TC::GetTargetCPU(const ArgList &Args) { int V = getHexagonVersion(Args); // FIXME: We don't support versions < 4. We should error on them. switch (V) { default: llvm_unreachable("Unexpected version"); case 5: return "v5"; case 4: return "v4"; case 3: return "v3"; case 2: return "v2"; case 1: return "v1"; } } // End Hexagon /// TCEToolChain - A tool chain using the llvm bitcode tools to perform /// all subcommands. See http://tce.cs.tut.fi for our peculiar target. /// Currently does not support anything else but compilation. TCEToolChain::TCEToolChain(const Driver &D, const llvm::Triple& Triple, const ArgList &Args) : ToolChain(D, Triple, Args) { // Path mangling to find libexec std::string Path(getDriver().Dir); Path += "/../libexec"; getProgramPaths().push_back(Path); } TCEToolChain::~TCEToolChain() { } bool TCEToolChain::IsMathErrnoDefault() const { return true; } bool TCEToolChain::isPICDefault() const { return false; } bool TCEToolChain::isPIEDefault() const { return false; } bool TCEToolChain::isPICDefaultForced() const { return false; } /// OpenBSD - OpenBSD tool chain which can call as(1) and ld(1) directly. OpenBSD::OpenBSD(const Driver &D, const llvm::Triple& Triple, const ArgList &Args) : Generic_ELF(D, Triple, Args) { getFilePaths().push_back(getDriver().Dir + "/../lib"); getFilePaths().push_back("/usr/lib"); } Tool *OpenBSD::buildAssembler() const { return new tools::openbsd::Assemble(*this); } Tool *OpenBSD::buildLinker() const { return new tools::openbsd::Link(*this); } /// Bitrig - Bitrig tool chain which can call as(1) and ld(1) directly. Bitrig::Bitrig(const Driver &D, const llvm::Triple& Triple, const ArgList &Args) : Generic_ELF(D, Triple, Args) { getFilePaths().push_back(getDriver().Dir + "/../lib"); getFilePaths().push_back("/usr/lib"); } Tool *Bitrig::buildAssembler() const { return new tools::bitrig::Assemble(*this); } Tool *Bitrig::buildLinker() const { return new tools::bitrig::Link(*this); } ToolChain::CXXStdlibType Bitrig::GetCXXStdlibType(const ArgList &Args) const { if (Arg *A = Args.getLastArg(options::OPT_stdlib_EQ)) { StringRef Value = A->getValue(); if (Value == "libstdc++") return ToolChain::CST_Libstdcxx; if (Value == "libc++") return ToolChain::CST_Libcxx; getDriver().Diag(diag::err_drv_invalid_stdlib_name) << A->getAsString(Args); } return ToolChain::CST_Libcxx; } void Bitrig::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { if (DriverArgs.hasArg(options::OPT_nostdlibinc) || DriverArgs.hasArg(options::OPT_nostdincxx)) return; switch (GetCXXStdlibType(DriverArgs)) { case ToolChain::CST_Libcxx: addSystemInclude(DriverArgs, CC1Args, getDriver().SysRoot + "/usr/include/c++/v1"); break; case ToolChain::CST_Libstdcxx: addSystemInclude(DriverArgs, CC1Args, getDriver().SysRoot + "/usr/include/c++/stdc++"); addSystemInclude(DriverArgs, CC1Args, getDriver().SysRoot + "/usr/include/c++/stdc++/backward"); StringRef Triple = getTriple().str(); if (Triple.startswith("amd64")) addSystemInclude(DriverArgs, CC1Args, getDriver().SysRoot + "/usr/include/c++/stdc++/x86_64" + Triple.substr(5)); else addSystemInclude(DriverArgs, CC1Args, getDriver().SysRoot + "/usr/include/c++/stdc++/" + Triple); break; } } void Bitrig::AddCXXStdlibLibArgs(const ArgList &Args, ArgStringList &CmdArgs) const { switch (GetCXXStdlibType(Args)) { case ToolChain::CST_Libcxx: CmdArgs.push_back("-lc++"); CmdArgs.push_back("-lc++abi"); CmdArgs.push_back("-lpthread"); break; case ToolChain::CST_Libstdcxx: CmdArgs.push_back("-lstdc++"); break; } } /// FreeBSD - FreeBSD tool chain which can call as(1) and ld(1) directly. FreeBSD::FreeBSD(const Driver &D, const llvm::Triple& Triple, const ArgList &Args) : Generic_ELF(D, Triple, Args) { // When targeting 32-bit platforms, look for '/usr/lib32/crt1.o' and fall // back to '/usr/lib' if it doesn't exist. if ((Triple.getArch() == llvm::Triple::x86 || Triple.getArch() == llvm::Triple::ppc) && llvm::sys::fs::exists(getDriver().SysRoot + "/usr/lib32/crt1.o")) getFilePaths().push_back(getDriver().SysRoot + "/usr/lib32"); else getFilePaths().push_back(getDriver().SysRoot + "/usr/lib"); } ToolChain::CXXStdlibType FreeBSD::GetCXXStdlibType(const ArgList &Args) const { if (Arg *A = Args.getLastArg(options::OPT_stdlib_EQ)) { StringRef Value = A->getValue(); if (Value == "libstdc++") return ToolChain::CST_Libstdcxx; if (Value == "libc++") return ToolChain::CST_Libcxx; getDriver().Diag(diag::err_drv_invalid_stdlib_name) << A->getAsString(Args); } if (getTriple().getOSMajorVersion() >= 10) return ToolChain::CST_Libcxx; return ToolChain::CST_Libstdcxx; } void FreeBSD::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { if (DriverArgs.hasArg(options::OPT_nostdlibinc) || DriverArgs.hasArg(options::OPT_nostdincxx)) return; switch (GetCXXStdlibType(DriverArgs)) { case ToolChain::CST_Libcxx: addSystemInclude(DriverArgs, CC1Args, getDriver().SysRoot + "/usr/include/c++/v1"); break; case ToolChain::CST_Libstdcxx: addSystemInclude(DriverArgs, CC1Args, getDriver().SysRoot + "/usr/include/c++/4.2"); addSystemInclude(DriverArgs, CC1Args, getDriver().SysRoot + "/usr/include/c++/4.2/backward"); break; } } Tool *FreeBSD::buildAssembler() const { return new tools::freebsd::Assemble(*this); } Tool *FreeBSD::buildLinker() const { return new tools::freebsd::Link(*this); } bool FreeBSD::UseSjLjExceptions() const { // FreeBSD uses SjLj exceptions on ARM oabi. switch (getTriple().getEnvironment()) { case llvm::Triple::GNUEABIHF: case llvm::Triple::GNUEABI: case llvm::Triple::EABI: return false; default: return (getTriple().getArch() == llvm::Triple::arm || getTriple().getArch() == llvm::Triple::thumb); } } bool FreeBSD::HasNativeLLVMSupport() const { return true; } bool FreeBSD::isPIEDefault() const { return getSanitizerArgs().requiresPIE(); } /// NetBSD - NetBSD tool chain which can call as(1) and ld(1) directly. NetBSD::NetBSD(const Driver &D, const llvm::Triple& Triple, const ArgList &Args) : Generic_ELF(D, Triple, Args) { if (getDriver().UseStdLib) { // When targeting a 32-bit platform, try the special directory used on // 64-bit hosts, and only fall back to the main library directory if that // doesn't work. // FIXME: It'd be nicer to test if this directory exists, but I'm not sure // what all logic is needed to emulate the '=' prefix here. switch (Triple.getArch()) { case llvm::Triple::x86: getFilePaths().push_back("=/usr/lib/i386"); break; case llvm::Triple::arm: case llvm::Triple::armeb: case llvm::Triple::thumb: case llvm::Triple::thumbeb: switch (Triple.getEnvironment()) { case llvm::Triple::EABI: case llvm::Triple::GNUEABI: getFilePaths().push_back("=/usr/lib/eabi"); break; case llvm::Triple::EABIHF: case llvm::Triple::GNUEABIHF: getFilePaths().push_back("=/usr/lib/eabihf"); break; default: getFilePaths().push_back("=/usr/lib/oabi"); break; } break; case llvm::Triple::mips64: case llvm::Triple::mips64el: if (tools::mips::hasMipsAbiArg(Args, "o32")) getFilePaths().push_back("=/usr/lib/o32"); else if (tools::mips::hasMipsAbiArg(Args, "64")) getFilePaths().push_back("=/usr/lib/64"); break; case llvm::Triple::ppc: getFilePaths().push_back("=/usr/lib/powerpc"); break; case llvm::Triple::sparc: getFilePaths().push_back("=/usr/lib/sparc"); break; default: break; } getFilePaths().push_back("=/usr/lib"); } } Tool *NetBSD::buildAssembler() const { return new tools::netbsd::Assemble(*this); } Tool *NetBSD::buildLinker() const { return new tools::netbsd::Link(*this); } ToolChain::CXXStdlibType NetBSD::GetCXXStdlibType(const ArgList &Args) const { if (Arg *A = Args.getLastArg(options::OPT_stdlib_EQ)) { StringRef Value = A->getValue(); if (Value == "libstdc++") return ToolChain::CST_Libstdcxx; if (Value == "libc++") return ToolChain::CST_Libcxx; getDriver().Diag(diag::err_drv_invalid_stdlib_name) << A->getAsString(Args); } unsigned Major, Minor, Micro; getTriple().getOSVersion(Major, Minor, Micro); if (Major >= 7 || (Major == 6 && Minor == 99 && Micro >= 49) || Major == 0) { switch (getArch()) { case llvm::Triple::aarch64: case llvm::Triple::arm: case llvm::Triple::armeb: case llvm::Triple::thumb: case llvm::Triple::thumbeb: case llvm::Triple::ppc: case llvm::Triple::ppc64: case llvm::Triple::ppc64le: case llvm::Triple::x86: case llvm::Triple::x86_64: return ToolChain::CST_Libcxx; default: break; } } return ToolChain::CST_Libstdcxx; } void NetBSD::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { if (DriverArgs.hasArg(options::OPT_nostdlibinc) || DriverArgs.hasArg(options::OPT_nostdincxx)) return; switch (GetCXXStdlibType(DriverArgs)) { case ToolChain::CST_Libcxx: addSystemInclude(DriverArgs, CC1Args, getDriver().SysRoot + "/usr/include/c++/"); break; case ToolChain::CST_Libstdcxx: addSystemInclude(DriverArgs, CC1Args, getDriver().SysRoot + "/usr/include/g++"); addSystemInclude(DriverArgs, CC1Args, getDriver().SysRoot + "/usr/include/g++/backward"); break; } } /// Minix - Minix tool chain which can call as(1) and ld(1) directly. Minix::Minix(const Driver &D, const llvm::Triple& Triple, const ArgList &Args) : Generic_ELF(D, Triple, Args) { getFilePaths().push_back(getDriver().Dir + "/../lib"); getFilePaths().push_back("/usr/lib"); } Tool *Minix::buildAssembler() const { return new tools::minix::Assemble(*this); } Tool *Minix::buildLinker() const { return new tools::minix::Link(*this); } /// Solaris - Solaris tool chain which can call as(1) and ld(1) directly. Solaris::Solaris(const Driver &D, const llvm::Triple& Triple, const ArgList &Args) : Generic_GCC(D, Triple, Args) { getProgramPaths().push_back(getDriver().getInstalledDir()); if (getDriver().getInstalledDir() != getDriver().Dir) getProgramPaths().push_back(getDriver().Dir); getFilePaths().push_back(getDriver().Dir + "/../lib"); getFilePaths().push_back("/usr/lib"); } Tool *Solaris::buildAssembler() const { return new tools::solaris::Assemble(*this); } Tool *Solaris::buildLinker() const { return new tools::solaris::Link(*this); } /// Distribution (very bare-bones at the moment). enum Distro { ArchLinux, DebianLenny, DebianSqueeze, DebianWheezy, DebianJessie, Exherbo, RHEL4, RHEL5, RHEL6, Fedora, OpenSUSE, UbuntuHardy, UbuntuIntrepid, UbuntuJaunty, UbuntuKarmic, UbuntuLucid, UbuntuMaverick, UbuntuNatty, UbuntuOneiric, UbuntuPrecise, UbuntuQuantal, UbuntuRaring, UbuntuSaucy, UbuntuTrusty, UnknownDistro }; static bool IsRedhat(enum Distro Distro) { return Distro == Fedora || (Distro >= RHEL4 && Distro <= RHEL6); } static bool IsOpenSUSE(enum Distro Distro) { return Distro == OpenSUSE; } static bool IsDebian(enum Distro Distro) { return Distro >= DebianLenny && Distro <= DebianJessie; } static bool IsUbuntu(enum Distro Distro) { return Distro >= UbuntuHardy && Distro <= UbuntuTrusty; } static Distro DetectDistro(llvm::Triple::ArchType Arch) { llvm::ErrorOr> File = llvm::MemoryBuffer::getFile("/etc/lsb-release"); if (File) { StringRef Data = File.get()->getBuffer(); SmallVector Lines; Data.split(Lines, "\n"); Distro Version = UnknownDistro; for (unsigned i = 0, s = Lines.size(); i != s; ++i) if (Version == UnknownDistro && Lines[i].startswith("DISTRIB_CODENAME=")) Version = llvm::StringSwitch(Lines[i].substr(17)) .Case("hardy", UbuntuHardy) .Case("intrepid", UbuntuIntrepid) .Case("jaunty", UbuntuJaunty) .Case("karmic", UbuntuKarmic) .Case("lucid", UbuntuLucid) .Case("maverick", UbuntuMaverick) .Case("natty", UbuntuNatty) .Case("oneiric", UbuntuOneiric) .Case("precise", UbuntuPrecise) .Case("quantal", UbuntuQuantal) .Case("raring", UbuntuRaring) .Case("saucy", UbuntuSaucy) .Case("trusty", UbuntuTrusty) .Default(UnknownDistro); return Version; } File = llvm::MemoryBuffer::getFile("/etc/redhat-release"); if (File) { StringRef Data = File.get()->getBuffer(); if (Data.startswith("Fedora release")) return Fedora; if (Data.startswith("Red Hat Enterprise Linux") || Data.startswith("CentOS")) { if (Data.find("release 6") != StringRef::npos) return RHEL6; else if (Data.find("release 5") != StringRef::npos) return RHEL5; else if (Data.find("release 4") != StringRef::npos) return RHEL4; } return UnknownDistro; } File = llvm::MemoryBuffer::getFile("/etc/debian_version"); if (File) { StringRef Data = File.get()->getBuffer(); if (Data[0] == '5') return DebianLenny; else if (Data.startswith("squeeze/sid") || Data[0] == '6') return DebianSqueeze; else if (Data.startswith("wheezy/sid") || Data[0] == '7') return DebianWheezy; else if (Data.startswith("jessie/sid") || Data[0] == '8') return DebianJessie; return UnknownDistro; } if (llvm::sys::fs::exists("/etc/SuSE-release")) return OpenSUSE; if (llvm::sys::fs::exists("/etc/exherbo-release")) return Exherbo; if (llvm::sys::fs::exists("/etc/arch-release")) return ArchLinux; return UnknownDistro; } /// \brief Get our best guess at the multiarch triple for a target. /// /// Debian-based systems are starting to use a multiarch setup where they use /// a target-triple directory in the library and header search paths. /// Unfortunately, this triple does not align with the vanilla target triple, /// so we provide a rough mapping here. static std::string getMultiarchTriple(const llvm::Triple &TargetTriple, StringRef SysRoot) { // For most architectures, just use whatever we have rather than trying to be // clever. switch (TargetTriple.getArch()) { default: return TargetTriple.str(); // We use the existence of '/lib/' as a directory to detect some // common linux triples that don't quite match the Clang triple for both // 32-bit and 64-bit targets. Multiarch fixes its install triples to these // regardless of what the actual target triple is. case llvm::Triple::arm: case llvm::Triple::thumb: if (TargetTriple.getEnvironment() == llvm::Triple::GNUEABIHF) { if (llvm::sys::fs::exists(SysRoot + "/lib/arm-linux-gnueabihf")) return "arm-linux-gnueabihf"; } else { if (llvm::sys::fs::exists(SysRoot + "/lib/arm-linux-gnueabi")) return "arm-linux-gnueabi"; } return TargetTriple.str(); case llvm::Triple::armeb: case llvm::Triple::thumbeb: if (TargetTriple.getEnvironment() == llvm::Triple::GNUEABIHF) { if (llvm::sys::fs::exists(SysRoot + "/lib/armeb-linux-gnueabihf")) return "armeb-linux-gnueabihf"; } else { if (llvm::sys::fs::exists(SysRoot + "/lib/armeb-linux-gnueabi")) return "armeb-linux-gnueabi"; } return TargetTriple.str(); case llvm::Triple::x86: if (llvm::sys::fs::exists(SysRoot + "/lib/i386-linux-gnu")) return "i386-linux-gnu"; return TargetTriple.str(); case llvm::Triple::x86_64: // We don't want this for x32, otherwise it will match x86_64 libs if (TargetTriple.getEnvironment() != llvm::Triple::GNUX32 && llvm::sys::fs::exists(SysRoot + "/lib/x86_64-linux-gnu")) return "x86_64-linux-gnu"; return TargetTriple.str(); case llvm::Triple::aarch64: if (llvm::sys::fs::exists(SysRoot + "/lib/aarch64-linux-gnu")) return "aarch64-linux-gnu"; return TargetTriple.str(); case llvm::Triple::aarch64_be: if (llvm::sys::fs::exists(SysRoot + "/lib/aarch64_be-linux-gnu")) return "aarch64_be-linux-gnu"; return TargetTriple.str(); case llvm::Triple::mips: if (llvm::sys::fs::exists(SysRoot + "/lib/mips-linux-gnu")) return "mips-linux-gnu"; return TargetTriple.str(); case llvm::Triple::mipsel: if (llvm::sys::fs::exists(SysRoot + "/lib/mipsel-linux-gnu")) return "mipsel-linux-gnu"; return TargetTriple.str(); case llvm::Triple::mips64: if (llvm::sys::fs::exists(SysRoot + "/lib/mips64-linux-gnu")) return "mips64-linux-gnu"; if (llvm::sys::fs::exists(SysRoot + "/lib/mips64-linux-gnuabi64")) return "mips64-linux-gnuabi64"; return TargetTriple.str(); case llvm::Triple::mips64el: if (llvm::sys::fs::exists(SysRoot + "/lib/mips64el-linux-gnu")) return "mips64el-linux-gnu"; if (llvm::sys::fs::exists(SysRoot + "/lib/mips64el-linux-gnuabi64")) return "mips64el-linux-gnuabi64"; return TargetTriple.str(); case llvm::Triple::ppc: if (llvm::sys::fs::exists(SysRoot + "/lib/powerpc-linux-gnuspe")) return "powerpc-linux-gnuspe"; if (llvm::sys::fs::exists(SysRoot + "/lib/powerpc-linux-gnu")) return "powerpc-linux-gnu"; return TargetTriple.str(); case llvm::Triple::ppc64: if (llvm::sys::fs::exists(SysRoot + "/lib/powerpc64-linux-gnu")) return "powerpc64-linux-gnu"; case llvm::Triple::ppc64le: if (llvm::sys::fs::exists(SysRoot + "/lib/powerpc64le-linux-gnu")) return "powerpc64le-linux-gnu"; return TargetTriple.str(); } } static void addPathIfExists(Twine Path, ToolChain::path_list &Paths) { if (llvm::sys::fs::exists(Path)) Paths.push_back(Path.str()); } static StringRef getOSLibDir(const llvm::Triple &Triple, const ArgList &Args) { if (isMipsArch(Triple.getArch())) { // lib32 directory has a special meaning on MIPS targets. // It contains N32 ABI binaries. Use this folder if produce // code for N32 ABI only. if (tools::mips::hasMipsAbiArg(Args, "n32")) return "lib32"; return Triple.isArch32Bit() ? "lib" : "lib64"; } // It happens that only x86 and PPC use the 'lib32' variant of oslibdir, and // using that variant while targeting other architectures causes problems // because the libraries are laid out in shared system roots that can't cope // with a 'lib32' library search path being considered. So we only enable // them when we know we may need it. // // FIXME: This is a bit of a hack. We should really unify this code for // reasoning about oslibdir spellings with the lib dir spellings in the // GCCInstallationDetector, but that is a more significant refactoring. if (Triple.getArch() == llvm::Triple::x86 || Triple.getArch() == llvm::Triple::ppc) return "lib32"; if (Triple.getArch() == llvm::Triple::x86_64 && Triple.getEnvironment() == llvm::Triple::GNUX32) return "libx32"; return Triple.isArch32Bit() ? "lib" : "lib64"; } Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) : Generic_ELF(D, Triple, Args) { GCCInstallation.init(D, Triple, Args); Multilibs = GCCInstallation.getMultilibs(); llvm::Triple::ArchType Arch = Triple.getArch(); std::string SysRoot = computeSysRoot(); // Cross-compiling binutils and GCC installations (vanilla and openSUSE at // least) put various tools in a triple-prefixed directory off of the parent // of the GCC installation. We use the GCC triple here to ensure that we end // up with tools that support the same amount of cross compiling as the // detected GCC installation. For example, if we find a GCC installation // targeting x86_64, but it is a bi-arch GCC installation, it can also be // used to target i386. // FIXME: This seems unlikely to be Linux-specific. ToolChain::path_list &PPaths = getProgramPaths(); PPaths.push_back(Twine(GCCInstallation.getParentLibPath() + "/../" + GCCInstallation.getTriple().str() + "/bin").str()); Linker = GetLinkerPath(); Distro Distro = DetectDistro(Arch); if (IsOpenSUSE(Distro) || IsUbuntu(Distro)) { ExtraOpts.push_back("-z"); ExtraOpts.push_back("relro"); } if (Arch == llvm::Triple::arm || Arch == llvm::Triple::thumb) ExtraOpts.push_back("-X"); const bool IsAndroid = Triple.getEnvironment() == llvm::Triple::Android; const bool IsMips = isMipsArch(Arch); if (IsMips && !SysRoot.empty()) ExtraOpts.push_back("--sysroot=" + SysRoot); // Do not use 'gnu' hash style for Mips targets because .gnu.hash // and the MIPS ABI require .dynsym to be sorted in different ways. // .gnu.hash needs symbols to be grouped by hash code whereas the MIPS // ABI requires a mapping between the GOT and the symbol table. // Android loader does not support .gnu.hash. if (!IsMips && !IsAndroid) { if (IsRedhat(Distro) || IsOpenSUSE(Distro) || (IsUbuntu(Distro) && Distro >= UbuntuMaverick)) ExtraOpts.push_back("--hash-style=gnu"); if (IsDebian(Distro) || IsOpenSUSE(Distro) || Distro == UbuntuLucid || Distro == UbuntuJaunty || Distro == UbuntuKarmic) ExtraOpts.push_back("--hash-style=both"); } if (IsRedhat(Distro)) ExtraOpts.push_back("--no-add-needed"); if (Distro == DebianSqueeze || Distro == DebianWheezy || Distro == DebianJessie || IsOpenSUSE(Distro) || (IsRedhat(Distro) && Distro != RHEL4 && Distro != RHEL5) || (IsUbuntu(Distro) && Distro >= UbuntuKarmic)) ExtraOpts.push_back("--build-id"); if (IsOpenSUSE(Distro)) ExtraOpts.push_back("--enable-new-dtags"); // The selection of paths to try here is designed to match the patterns which // the GCC driver itself uses, as this is part of the GCC-compatible driver. // This was determined by running GCC in a fake filesystem, creating all // possible permutations of these directories, and seeing which ones it added // to the link paths. path_list &Paths = getFilePaths(); const std::string OSLibDir = getOSLibDir(Triple, Args); const std::string MultiarchTriple = getMultiarchTriple(Triple, SysRoot); // Add the multilib suffixed paths where they are available. if (GCCInstallation.isValid()) { const llvm::Triple &GCCTriple = GCCInstallation.getTriple(); const std::string &LibPath = GCCInstallation.getParentLibPath(); const Multilib &Multilib = GCCInstallation.getMultilib(); // Sourcery CodeBench MIPS toolchain holds some libraries under // a biarch-like suffix of the GCC installation. addPathIfExists((GCCInstallation.getInstallPath() + Multilib.gccSuffix()), Paths); // GCC cross compiling toolchains will install target libraries which ship // as part of the toolchain under // rather than as // any part of the GCC installation in // //gcc//. This decision is somewhat // debatable, but is the reality today. We need to search this tree even // when we have a sysroot somewhere else. It is the responsibility of // whomever is doing the cross build targeting a sysroot using a GCC // installation that is *not* within the system root to ensure two things: // // 1) Any DSOs that are linked in from this tree or from the install path // above must be present on the system root and found via an // appropriate rpath. // 2) There must not be libraries installed into // // unless they should be preferred over // those within the system root. // // Note that this matches the GCC behavior. See the below comment for where // Clang diverges from GCC's behavior. addPathIfExists(LibPath + "/../" + GCCTriple.str() + "/lib/../" + OSLibDir + Multilib.osSuffix(), Paths); // If the GCC installation we found is inside of the sysroot, we want to // prefer libraries installed in the parent prefix of the GCC installation. // It is important to *not* use these paths when the GCC installation is // outside of the system root as that can pick up unintended libraries. // This usually happens when there is an external cross compiler on the // host system, and a more minimal sysroot available that is the target of // the cross. Note that GCC does include some of these directories in some // configurations but this seems somewhere between questionable and simply // a bug. if (StringRef(LibPath).startswith(SysRoot)) { addPathIfExists(LibPath + "/" + MultiarchTriple, Paths); addPathIfExists(LibPath + "/../" + OSLibDir, Paths); } } // Similar to the logic for GCC above, if we currently running Clang inside // of the requested system root, add its parent library paths to // those searched. // FIXME: It's not clear whether we should use the driver's installed // directory ('Dir' below) or the ResourceDir. if (StringRef(D.Dir).startswith(SysRoot)) { addPathIfExists(D.Dir + "/../lib/" + MultiarchTriple, Paths); addPathIfExists(D.Dir + "/../" + OSLibDir, Paths); } addPathIfExists(SysRoot + "/lib/" + MultiarchTriple, Paths); addPathIfExists(SysRoot + "/lib/../" + OSLibDir, Paths); addPathIfExists(SysRoot + "/usr/lib/" + MultiarchTriple, Paths); addPathIfExists(SysRoot + "/usr/lib/../" + OSLibDir, Paths); // Try walking via the GCC triple path in case of biarch or multiarch GCC // installations with strange symlinks. if (GCCInstallation.isValid()) { addPathIfExists(SysRoot + "/usr/lib/" + GCCInstallation.getTriple().str() + "/../../" + OSLibDir, Paths); // Add the 'other' biarch variant path Multilib BiarchSibling; if (GCCInstallation.getBiarchSibling(BiarchSibling)) { addPathIfExists(GCCInstallation.getInstallPath() + BiarchSibling.gccSuffix(), Paths); } // See comments above on the multilib variant for details of why this is // included even from outside the sysroot. const std::string &LibPath = GCCInstallation.getParentLibPath(); const llvm::Triple &GCCTriple = GCCInstallation.getTriple(); const Multilib &Multilib = GCCInstallation.getMultilib(); addPathIfExists(LibPath + "/../" + GCCTriple.str() + "/lib" + Multilib.osSuffix(), Paths); // See comments above on the multilib variant for details of why this is // only included from within the sysroot. if (StringRef(LibPath).startswith(SysRoot)) addPathIfExists(LibPath, Paths); } // Similar to the logic for GCC above, if we are currently running Clang // inside of the requested system root, add its parent library path to those // searched. // FIXME: It's not clear whether we should use the driver's installed // directory ('Dir' below) or the ResourceDir. if (StringRef(D.Dir).startswith(SysRoot)) addPathIfExists(D.Dir + "/../lib", Paths); addPathIfExists(SysRoot + "/lib", Paths); addPathIfExists(SysRoot + "/usr/lib", Paths); } bool Linux::HasNativeLLVMSupport() const { return true; } Tool *Linux::buildLinker() const { return new tools::gnutools::Link(*this); } Tool *Linux::buildAssembler() const { return new tools::gnutools::Assemble(*this); } std::string Linux::computeSysRoot() const { if (!getDriver().SysRoot.empty()) return getDriver().SysRoot; if (!GCCInstallation.isValid() || !isMipsArch(getTriple().getArch())) return std::string(); // Standalone MIPS toolchains use different names for sysroot folder // and put it into different places. Here we try to check some known // variants. const StringRef InstallDir = GCCInstallation.getInstallPath(); const StringRef TripleStr = GCCInstallation.getTriple().str(); const Multilib &Multilib = GCCInstallation.getMultilib(); std::string Path = (InstallDir + "/../../../../" + TripleStr + "/libc" + Multilib.osSuffix()).str(); if (llvm::sys::fs::exists(Path)) return Path; Path = (InstallDir + "/../../../../sysroot" + Multilib.osSuffix()).str(); if (llvm::sys::fs::exists(Path)) return Path; return std::string(); } void Linux::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { const Driver &D = getDriver(); std::string SysRoot = computeSysRoot(); if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) addSystemInclude(DriverArgs, CC1Args, SysRoot + "/usr/local/include"); if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { SmallString<128> P(D.ResourceDir); llvm::sys::path::append(P, "include"); addSystemInclude(DriverArgs, CC1Args, P.str()); } if (DriverArgs.hasArg(options::OPT_nostdlibinc)) return; // Check for configure-time C include directories. StringRef CIncludeDirs(C_INCLUDE_DIRS); if (CIncludeDirs != "") { SmallVector dirs; CIncludeDirs.split(dirs, ":"); for (StringRef dir : dirs) { - StringRef Prefix = llvm::sys::path::is_absolute(dir) ? SysRoot : ""; + StringRef Prefix = + llvm::sys::path::is_absolute(dir) ? StringRef(SysRoot) : ""; addExternCSystemInclude(DriverArgs, CC1Args, Prefix + dir); } return; } // Lacking those, try to detect the correct set of system includes for the // target triple. // Add include directories specific to the selected multilib set and multilib. if (GCCInstallation.isValid()) { auto Callback = Multilibs.includeDirsCallback(); if (Callback) { const auto IncludePaths = Callback(GCCInstallation.getInstallPath(), GCCInstallation.getTriple().str(), GCCInstallation.getMultilib()); for (const auto &Path : IncludePaths) addExternCSystemIncludeIfExists(DriverArgs, CC1Args, Path); } } // Implement generic Debian multiarch support. const StringRef X86_64MultiarchIncludeDirs[] = { "/usr/include/x86_64-linux-gnu", // FIXME: These are older forms of multiarch. It's not clear that they're // in use in any released version of Debian, so we should consider // removing them. "/usr/include/i686-linux-gnu/64", "/usr/include/i486-linux-gnu/64" }; const StringRef X86MultiarchIncludeDirs[] = { "/usr/include/i386-linux-gnu", // FIXME: These are older forms of multiarch. It's not clear that they're // in use in any released version of Debian, so we should consider // removing them. "/usr/include/x86_64-linux-gnu/32", "/usr/include/i686-linux-gnu", "/usr/include/i486-linux-gnu" }; const StringRef AArch64MultiarchIncludeDirs[] = { "/usr/include/aarch64-linux-gnu" }; const StringRef ARMMultiarchIncludeDirs[] = { "/usr/include/arm-linux-gnueabi" }; const StringRef ARMHFMultiarchIncludeDirs[] = { "/usr/include/arm-linux-gnueabihf" }; const StringRef MIPSMultiarchIncludeDirs[] = { "/usr/include/mips-linux-gnu" }; const StringRef MIPSELMultiarchIncludeDirs[] = { "/usr/include/mipsel-linux-gnu" }; const StringRef MIPS64MultiarchIncludeDirs[] = { "/usr/include/mips64-linux-gnu", "/usr/include/mips64-linux-gnuabi64" }; const StringRef MIPS64ELMultiarchIncludeDirs[] = { "/usr/include/mips64el-linux-gnu", "/usr/include/mips64el-linux-gnuabi64" }; const StringRef PPCMultiarchIncludeDirs[] = { "/usr/include/powerpc-linux-gnu" }; const StringRef PPC64MultiarchIncludeDirs[] = { "/usr/include/powerpc64-linux-gnu" }; const StringRef PPC64LEMultiarchIncludeDirs[] = { "/usr/include/powerpc64le-linux-gnu" }; ArrayRef MultiarchIncludeDirs; if (getTriple().getArch() == llvm::Triple::x86_64) { MultiarchIncludeDirs = X86_64MultiarchIncludeDirs; } else if (getTriple().getArch() == llvm::Triple::x86) { MultiarchIncludeDirs = X86MultiarchIncludeDirs; } else if (getTriple().getArch() == llvm::Triple::aarch64 || getTriple().getArch() == llvm::Triple::aarch64_be) { MultiarchIncludeDirs = AArch64MultiarchIncludeDirs; } else if (getTriple().getArch() == llvm::Triple::arm) { if (getTriple().getEnvironment() == llvm::Triple::GNUEABIHF) MultiarchIncludeDirs = ARMHFMultiarchIncludeDirs; else MultiarchIncludeDirs = ARMMultiarchIncludeDirs; } else if (getTriple().getArch() == llvm::Triple::mips) { MultiarchIncludeDirs = MIPSMultiarchIncludeDirs; } else if (getTriple().getArch() == llvm::Triple::mipsel) { MultiarchIncludeDirs = MIPSELMultiarchIncludeDirs; } else if (getTriple().getArch() == llvm::Triple::mips64) { MultiarchIncludeDirs = MIPS64MultiarchIncludeDirs; } else if (getTriple().getArch() == llvm::Triple::mips64el) { MultiarchIncludeDirs = MIPS64ELMultiarchIncludeDirs; } else if (getTriple().getArch() == llvm::Triple::ppc) { MultiarchIncludeDirs = PPCMultiarchIncludeDirs; } else if (getTriple().getArch() == llvm::Triple::ppc64) { MultiarchIncludeDirs = PPC64MultiarchIncludeDirs; } else if (getTriple().getArch() == llvm::Triple::ppc64le) { MultiarchIncludeDirs = PPC64LEMultiarchIncludeDirs; } for (StringRef Dir : MultiarchIncludeDirs) { if (llvm::sys::fs::exists(SysRoot + Dir)) { addExternCSystemInclude(DriverArgs, CC1Args, SysRoot + Dir); break; } } if (getTriple().getOS() == llvm::Triple::RTEMS) return; // Add an include of '/include' directly. This isn't provided by default by // system GCCs, but is often used with cross-compiling GCCs, and harmless to // add even when Clang is acting as-if it were a system compiler. addExternCSystemInclude(DriverArgs, CC1Args, SysRoot + "/include"); addExternCSystemInclude(DriverArgs, CC1Args, SysRoot + "/usr/include"); } /// \brief Helper to add the variant paths of a libstdc++ installation. /*static*/ bool Linux::addLibStdCXXIncludePaths(Twine Base, Twine Suffix, StringRef GCCTriple, StringRef GCCMultiarchTriple, StringRef TargetMultiarchTriple, Twine IncludeSuffix, const ArgList &DriverArgs, ArgStringList &CC1Args) { if (!llvm::sys::fs::exists(Base + Suffix)) return false; addSystemInclude(DriverArgs, CC1Args, Base + Suffix); // The vanilla GCC layout of libstdc++ headers uses a triple subdirectory. If // that path exists or we have neither a GCC nor target multiarch triple, use // this vanilla search path. if ((GCCMultiarchTriple.empty() && TargetMultiarchTriple.empty()) || llvm::sys::fs::exists(Base + Suffix + "/" + GCCTriple + IncludeSuffix)) { addSystemInclude(DriverArgs, CC1Args, Base + Suffix + "/" + GCCTriple + IncludeSuffix); } else { // Otherwise try to use multiarch naming schemes which have normalized the // triples and put the triple before the suffix. // // GCC surprisingly uses *both* the GCC triple with a multilib suffix and // the target triple, so we support that here. addSystemInclude(DriverArgs, CC1Args, Base + "/" + GCCMultiarchTriple + Suffix + IncludeSuffix); addSystemInclude(DriverArgs, CC1Args, Base + "/" + TargetMultiarchTriple + Suffix); } addSystemInclude(DriverArgs, CC1Args, Base + Suffix + "/backward"); return true; } void Linux::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { if (DriverArgs.hasArg(options::OPT_nostdlibinc) || DriverArgs.hasArg(options::OPT_nostdincxx)) return; // Check if libc++ has been enabled and provide its include paths if so. if (GetCXXStdlibType(DriverArgs) == ToolChain::CST_Libcxx) { const std::string LibCXXIncludePathCandidates[] = { // The primary location is within the Clang installation. // FIXME: We shouldn't hard code 'v1' here to make Clang future proof to // newer ABI versions. getDriver().Dir + "/../include/c++/v1", // We also check the system as for a long time this is the only place Clang looked. // FIXME: We should really remove this. It doesn't make any sense. getDriver().SysRoot + "/usr/include/c++/v1" }; for (const auto &IncludePath : LibCXXIncludePathCandidates) { if (!llvm::sys::fs::exists(IncludePath)) continue; // Add the first candidate that exists. addSystemInclude(DriverArgs, CC1Args, IncludePath); break; } return; } // We need a detected GCC installation on Linux to provide libstdc++'s // headers. We handled the libc++ case above. if (!GCCInstallation.isValid()) return; // By default, look for the C++ headers in an include directory adjacent to // the lib directory of the GCC installation. Note that this is expect to be // equivalent to '/usr/include/c++/X.Y' in almost all cases. StringRef LibDir = GCCInstallation.getParentLibPath(); StringRef InstallDir = GCCInstallation.getInstallPath(); StringRef TripleStr = GCCInstallation.getTriple().str(); const Multilib &Multilib = GCCInstallation.getMultilib(); const std::string GCCMultiarchTriple = getMultiarchTriple(GCCInstallation.getTriple(), getDriver().SysRoot); const std::string TargetMultiarchTriple = getMultiarchTriple(getTriple(), getDriver().SysRoot); const GCCVersion &Version = GCCInstallation.getVersion(); // The primary search for libstdc++ supports multiarch variants. if (addLibStdCXXIncludePaths(LibDir.str() + "/../include", "/c++/" + Version.Text, TripleStr, GCCMultiarchTriple, TargetMultiarchTriple, Multilib.includeSuffix(), DriverArgs, CC1Args)) return; // Otherwise, fall back on a bunch of options which don't use multiarch // layouts for simplicity. const std::string LibStdCXXIncludePathCandidates[] = { // Gentoo is weird and places its headers inside the GCC install, so if the // first attempt to find the headers fails, try these patterns. InstallDir.str() + "/include/g++-v" + Version.MajorStr + "." + Version.MinorStr, InstallDir.str() + "/include/g++-v" + Version.MajorStr, // Android standalone toolchain has C++ headers in yet another place. LibDir.str() + "/../" + TripleStr.str() + "/include/c++/" + Version.Text, // Freescale SDK C++ headers are directly in /usr/include/c++, // without a subdirectory corresponding to the gcc version. LibDir.str() + "/../include/c++", }; for (const auto &IncludePath : LibStdCXXIncludePathCandidates) { if (addLibStdCXXIncludePaths(IncludePath, /*Suffix*/ "", TripleStr, /*GCCMultiarchTriple*/ "", /*TargetMultiarchTriple*/ "", Multilib.includeSuffix(), DriverArgs, CC1Args)) break; } } bool Linux::isPIEDefault() const { return getSanitizerArgs().requiresPIE(); } /// DragonFly - DragonFly tool chain which can call as(1) and ld(1) directly. DragonFly::DragonFly(const Driver &D, const llvm::Triple& Triple, const ArgList &Args) : Generic_ELF(D, Triple, Args) { // Path mangling to find libexec getProgramPaths().push_back(getDriver().getInstalledDir()); if (getDriver().getInstalledDir() != getDriver().Dir) getProgramPaths().push_back(getDriver().Dir); getFilePaths().push_back(getDriver().Dir + "/../lib"); getFilePaths().push_back("/usr/lib"); if (llvm::sys::fs::exists("/usr/lib/gcc47")) getFilePaths().push_back("/usr/lib/gcc47"); else getFilePaths().push_back("/usr/lib/gcc44"); } Tool *DragonFly::buildAssembler() const { return new tools::dragonfly::Assemble(*this); } Tool *DragonFly::buildLinker() const { return new tools::dragonfly::Link(*this); } /// XCore tool chain XCore::XCore(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) : ToolChain(D, Triple, Args) { // ProgramPaths are found via 'PATH' environment variable. } Tool *XCore::buildAssembler() const { return new tools::XCore::Assemble(*this); } Tool *XCore::buildLinker() const { return new tools::XCore::Link(*this); } bool XCore::isPICDefault() const { return false; } bool XCore::isPIEDefault() const { return false; } bool XCore::isPICDefaultForced() const { return false; } bool XCore::SupportsProfiling() const { return false; } bool XCore::hasBlocksRuntime() const { return false; } void XCore::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { if (DriverArgs.hasArg(options::OPT_nostdinc) || DriverArgs.hasArg(options::OPT_nostdlibinc)) return; if (const char *cl_include_dir = getenv("XCC_C_INCLUDE_PATH")) { SmallVector Dirs; const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator,'\0'}; StringRef(cl_include_dir).split(Dirs, StringRef(EnvPathSeparatorStr)); ArrayRef DirVec(Dirs); addSystemIncludes(DriverArgs, CC1Args, DirVec); } } void XCore::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const { CC1Args.push_back("-nostdsysteminc"); } void XCore::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { if (DriverArgs.hasArg(options::OPT_nostdinc) || DriverArgs.hasArg(options::OPT_nostdlibinc) || DriverArgs.hasArg(options::OPT_nostdincxx)) return; if (const char *cl_include_dir = getenv("XCC_CPLUS_INCLUDE_PATH")) { SmallVector Dirs; const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator,'\0'}; StringRef(cl_include_dir).split(Dirs, StringRef(EnvPathSeparatorStr)); ArrayRef DirVec(Dirs); addSystemIncludes(DriverArgs, CC1Args, DirVec); } } void XCore::AddCXXStdlibLibArgs(const ArgList &Args, ArgStringList &CmdArgs) const { // We don't output any lib args. This is handled by xcc. } Index: projects/clang360-import/contrib/llvm/tools/clang =================================================================== --- projects/clang360-import/contrib/llvm/tools/clang (revision 279024) +++ projects/clang360-import/contrib/llvm/tools/clang (revision 279025) Property changes on: projects/clang360-import/contrib/llvm/tools/clang ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor/clang/dist:r278756-279023 Index: projects/clang360-import/contrib/llvm =================================================================== --- projects/clang360-import/contrib/llvm (revision 279024) +++ projects/clang360-import/contrib/llvm (revision 279025) Property changes on: projects/clang360-import/contrib/llvm ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor/llvm/dist:r278756-279023 Index: projects/clang360-import/lib/clang/include/clang/Basic/Version.inc =================================================================== --- projects/clang360-import/lib/clang/include/clang/Basic/Version.inc (revision 279024) +++ projects/clang360-import/lib/clang/include/clang/Basic/Version.inc (revision 279025) @@ -1,11 +1,11 @@ /* $FreeBSD$ */ #define CLANG_VERSION 3.6.0 #define CLANG_VERSION_MAJOR 3 #define CLANG_VERSION_MINOR 6 #define CLANG_VERSION_PATCHLEVEL 0 #define CLANG_VENDOR "FreeBSD " -#define CLANG_VENDOR_SUFFIX " 20150214" +#define CLANG_VENDOR_SUFFIX " 20150219" -#define SVN_REVISION "229040" +#define SVN_REVISION "229772" Index: projects/clang360-import/lib/clang/libllvmexecutionengine/Makefile =================================================================== --- projects/clang360-import/lib/clang/libllvmexecutionengine/Makefile (revision 279024) +++ projects/clang360-import/lib/clang/libllvmexecutionengine/Makefile (revision 279025) @@ -1,14 +1,13 @@ # $FreeBSD$ .include LIB= llvmexecutionengine SRCDIR= lib/ExecutionEngine SRCS= ExecutionEngine.cpp \ ExecutionEngineBindings.cpp \ GDBRegistrationListener.cpp \ - RTDyldMemoryManager.cpp \ TargetSelect.cpp .include "../clang.lib.mk" Index: projects/clang360-import/lib/clang/libllvmruntimedyld/Makefile =================================================================== --- projects/clang360-import/lib/clang/libllvmruntimedyld/Makefile (revision 279024) +++ projects/clang360-import/lib/clang/libllvmruntimedyld/Makefile (revision 279025) @@ -1,13 +1,14 @@ # $FreeBSD$ .include LIB= llvmruntimedyld SRCDIR= lib/ExecutionEngine/RuntimeDyld -SRCS= RuntimeDyld.cpp \ +SRCS= RTDyldMemoryManager.cpp \ + RuntimeDyld.cpp \ RuntimeDyldChecker.cpp \ RuntimeDyldELF.cpp \ RuntimeDyldMachO.cpp .include "../clang.lib.mk"