Index: projects/clang380-import/contrib/llvm/include/llvm/CodeGen/MachineFunction.h =================================================================== --- projects/clang380-import/contrib/llvm/include/llvm/CodeGen/MachineFunction.h (revision 294608) +++ projects/clang380-import/contrib/llvm/include/llvm/CodeGen/MachineFunction.h (revision 294609) @@ -1,585 +1,585 @@ //===-- llvm/CodeGen/MachineFunction.h --------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // Collect native machine code for a function. This class contains a list of // MachineBasicBlock instances that make up the current compiled function. // // This class also contains pointers to various classes which hold // target-specific information about the generated code. // //===----------------------------------------------------------------------===// #ifndef LLVM_CODEGEN_MACHINEFUNCTION_H #define LLVM_CODEGEN_MACHINEFUNCTION_H #include "llvm/ADT/ilist.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/ArrayRecycler.h" #include "llvm/Support/Recycler.h" namespace llvm { class Value; class Function; class GCModuleInfo; class MachineRegisterInfo; class MachineFrameInfo; class MachineConstantPool; class MachineJumpTableInfo; class MachineModuleInfo; class MCContext; class Pass; class PseudoSourceValueManager; class TargetMachine; class TargetSubtargetInfo; class TargetRegisterClass; struct MachinePointerInfo; struct WinEHFuncInfo; template <> struct ilist_traits : public ilist_default_traits { mutable ilist_half_node Sentinel; public: MachineBasicBlock *createSentinel() const { return static_cast(&Sentinel); } void destroySentinel(MachineBasicBlock *) const {} MachineBasicBlock *provideInitialHead() const { return createSentinel(); } MachineBasicBlock *ensureHead(MachineBasicBlock*) const { return createSentinel(); } static void noteHead(MachineBasicBlock*, MachineBasicBlock*) {} void addNodeToList(MachineBasicBlock* MBB); void removeNodeFromList(MachineBasicBlock* MBB); void deleteNode(MachineBasicBlock *MBB); private: void createNode(const MachineBasicBlock &); }; /// MachineFunctionInfo - This class can be derived from and used by targets to /// hold private target-specific information for each MachineFunction. Objects /// of type are accessed/created with MF::getInfo and destroyed when the /// MachineFunction is destroyed. struct MachineFunctionInfo { virtual ~MachineFunctionInfo(); /// \brief Factory function: default behavior is to call new using the /// supplied allocator. /// /// This function can be overridden in a derive class. template static Ty *create(BumpPtrAllocator &Allocator, MachineFunction &MF) { return new (Allocator.Allocate()) Ty(MF); } }; class MachineFunction { const Function *Fn; const TargetMachine &Target; const TargetSubtargetInfo *STI; MCContext &Ctx; MachineModuleInfo &MMI; // RegInfo - Information about each register in use in the function. MachineRegisterInfo *RegInfo; // Used to keep track of target-specific per-machine function information for // the target implementation. MachineFunctionInfo *MFInfo; // Keep track of objects allocated on the stack. MachineFrameInfo *FrameInfo; // Keep track of constants which are spilled to memory MachineConstantPool *ConstantPool; // Keep track of jump tables for switch instructions MachineJumpTableInfo *JumpTableInfo; // Keeps track of Windows exception handling related data. This will be null // for functions that aren't using a funclet-based EH personality. WinEHFuncInfo *WinEHInfo = nullptr; // Function-level unique numbering for MachineBasicBlocks. When a // MachineBasicBlock is inserted into a MachineFunction is it automatically // numbered and this vector keeps track of the mapping from ID's to MBB's. std::vector MBBNumbering; // Pool-allocate MachineFunction-lifetime and IR objects. BumpPtrAllocator Allocator; // Allocation management for instructions in function. Recycler InstructionRecycler; // Allocation management for operand arrays on instructions. ArrayRecycler OperandRecycler; // Allocation management for basic blocks in function. Recycler BasicBlockRecycler; // List of machine basic blocks in function typedef ilist BasicBlockListType; BasicBlockListType BasicBlocks; /// FunctionNumber - This provides a unique ID for each function emitted in /// this translation unit. /// unsigned FunctionNumber; /// Alignment - The alignment of the function. unsigned Alignment; /// ExposesReturnsTwice - True if the function calls setjmp or related /// functions with attribute "returns twice", but doesn't have /// the attribute itself. /// This is used to limit optimizations which cannot reason /// about the control flow of such functions. bool ExposesReturnsTwice; /// True if the function includes any inline assembly. bool HasInlineAsm; // Allocation management for pseudo source values. std::unique_ptr PSVManager; MachineFunction(const MachineFunction &) = delete; void operator=(const MachineFunction&) = delete; public: MachineFunction(const Function *Fn, const TargetMachine &TM, unsigned FunctionNum, MachineModuleInfo &MMI); ~MachineFunction(); MachineModuleInfo &getMMI() const { return MMI; } MCContext &getContext() const { return Ctx; } PseudoSourceValueManager &getPSVManager() const { return *PSVManager; } /// Return the DataLayout attached to the Module associated to this MF. const DataLayout &getDataLayout() const; /// getFunction - Return the LLVM function that this machine code represents /// const Function *getFunction() const { return Fn; } /// getName - Return the name of the corresponding LLVM function. /// StringRef getName() const; /// getFunctionNumber - Return a unique ID for the current function. /// unsigned getFunctionNumber() const { return FunctionNumber; } /// getTarget - Return the target machine this machine code is compiled with /// const TargetMachine &getTarget() const { return Target; } /// getSubtarget - Return the subtarget for which this machine code is being /// compiled. const TargetSubtargetInfo &getSubtarget() const { return *STI; } void setSubtarget(const TargetSubtargetInfo *ST) { STI = ST; } /// getSubtarget - This method returns a pointer to the specified type of /// TargetSubtargetInfo. In debug builds, it verifies that the object being /// returned is of the correct type. template const STC &getSubtarget() const { return *static_cast(STI); } /// getRegInfo - Return information about the registers currently in use. /// MachineRegisterInfo &getRegInfo() { return *RegInfo; } const MachineRegisterInfo &getRegInfo() const { return *RegInfo; } /// getFrameInfo - Return the frame info object for the current function. /// This object contains information about objects allocated on the stack /// frame of the current function in an abstract way. /// MachineFrameInfo *getFrameInfo() { return FrameInfo; } const MachineFrameInfo *getFrameInfo() const { return FrameInfo; } /// getJumpTableInfo - Return the jump table info object for the current /// function. This object contains information about jump tables in the /// current function. If the current function has no jump tables, this will /// return null. const MachineJumpTableInfo *getJumpTableInfo() const { return JumpTableInfo; } MachineJumpTableInfo *getJumpTableInfo() { return JumpTableInfo; } /// getOrCreateJumpTableInfo - Get the JumpTableInfo for this function, if it /// does already exist, allocate one. MachineJumpTableInfo *getOrCreateJumpTableInfo(unsigned JTEntryKind); /// getConstantPool - Return the constant pool object for the current /// function. /// MachineConstantPool *getConstantPool() { return ConstantPool; } const MachineConstantPool *getConstantPool() const { return ConstantPool; } /// getWinEHFuncInfo - Return information about how the current function uses /// Windows exception handling. Returns null for functions that don't use /// funclets for exception handling. const WinEHFuncInfo *getWinEHFuncInfo() const { return WinEHInfo; } WinEHFuncInfo *getWinEHFuncInfo() { return WinEHInfo; } /// getAlignment - Return the alignment (log2, not bytes) of the function. /// unsigned getAlignment() const { return Alignment; } /// setAlignment - Set the alignment (log2, not bytes) of the function. /// void setAlignment(unsigned A) { Alignment = A; } /// ensureAlignment - Make sure the function is at least 1 << A bytes aligned. void ensureAlignment(unsigned A) { if (Alignment < A) Alignment = A; } /// exposesReturnsTwice - Returns true if the function calls setjmp or /// any other similar functions with attribute "returns twice" without /// having the attribute itself. bool exposesReturnsTwice() const { return ExposesReturnsTwice; } /// setCallsSetJmp - Set a flag that indicates if there's a call to /// a "returns twice" function. void setExposesReturnsTwice(bool B) { ExposesReturnsTwice = B; } /// Returns true if the function contains any inline assembly. bool hasInlineAsm() const { return HasInlineAsm; } /// Set a flag that indicates that the function contains inline assembly. void setHasInlineAsm(bool B) { HasInlineAsm = B; } /// getInfo - Keep track of various per-function pieces of information for /// backends that would like to do so. /// template Ty *getInfo() { if (!MFInfo) MFInfo = Ty::template create(Allocator, *this); return static_cast(MFInfo); } template const Ty *getInfo() const { return const_cast(this)->getInfo(); } /// getBlockNumbered - MachineBasicBlocks are automatically numbered when they /// are inserted into the machine function. The block number for a machine /// basic block can be found by using the MBB::getBlockNumber method, this /// method provides the inverse mapping. /// MachineBasicBlock *getBlockNumbered(unsigned N) const { assert(N < MBBNumbering.size() && "Illegal block number"); assert(MBBNumbering[N] && "Block was removed from the machine function!"); return MBBNumbering[N]; } /// Should we be emitting segmented stack stuff for the function - bool shouldSplitStack(); + bool shouldSplitStack() const; /// getNumBlockIDs - Return the number of MBB ID's allocated. /// unsigned getNumBlockIDs() const { return (unsigned)MBBNumbering.size(); } /// RenumberBlocks - This discards all of the MachineBasicBlock numbers and /// recomputes them. This guarantees that the MBB numbers are sequential, /// dense, and match the ordering of the blocks within the function. If a /// specific MachineBasicBlock is specified, only that block and those after /// it are renumbered. void RenumberBlocks(MachineBasicBlock *MBBFrom = nullptr); /// print - Print out the MachineFunction in a format suitable for debugging /// to the specified stream. /// void print(raw_ostream &OS, SlotIndexes* = nullptr) const; /// viewCFG - This function is meant for use from the debugger. You can just /// say 'call F->viewCFG()' and a ghostview window should pop up from the /// program, displaying the CFG of the current function with the code for each /// basic block inside. This depends on there being a 'dot' and 'gv' program /// in your path. /// void viewCFG() const; /// viewCFGOnly - This function is meant for use from the debugger. It works /// just like viewCFG, but it does not include the contents of basic blocks /// into the nodes, just the label. If you are only interested in the CFG /// this can make the graph smaller. /// void viewCFGOnly() const; /// dump - Print the current MachineFunction to cerr, useful for debugger use. /// void dump() const; /// verify - Run the current MachineFunction through the machine code /// verifier, useful for debugger use. void verify(Pass *p = nullptr, const char *Banner = nullptr) const; // Provide accessors for the MachineBasicBlock list... typedef BasicBlockListType::iterator iterator; typedef BasicBlockListType::const_iterator const_iterator; typedef std::reverse_iterator const_reverse_iterator; typedef std::reverse_iterator reverse_iterator; /// Support for MachineBasicBlock::getNextNode(). static BasicBlockListType MachineFunction::* getSublistAccess(MachineBasicBlock *) { return &MachineFunction::BasicBlocks; } /// addLiveIn - Add the specified physical register as a live-in value and /// create a corresponding virtual register for it. unsigned addLiveIn(unsigned PReg, const TargetRegisterClass *RC); //===--------------------------------------------------------------------===// // BasicBlock accessor functions. // iterator begin() { return BasicBlocks.begin(); } const_iterator begin() const { return BasicBlocks.begin(); } iterator end () { return BasicBlocks.end(); } const_iterator end () const { return BasicBlocks.end(); } reverse_iterator rbegin() { return BasicBlocks.rbegin(); } const_reverse_iterator rbegin() const { return BasicBlocks.rbegin(); } reverse_iterator rend () { return BasicBlocks.rend(); } const_reverse_iterator rend () const { return BasicBlocks.rend(); } unsigned size() const { return (unsigned)BasicBlocks.size();} bool empty() const { return BasicBlocks.empty(); } const MachineBasicBlock &front() const { return BasicBlocks.front(); } MachineBasicBlock &front() { return BasicBlocks.front(); } const MachineBasicBlock & back() const { return BasicBlocks.back(); } MachineBasicBlock & back() { return BasicBlocks.back(); } void push_back (MachineBasicBlock *MBB) { BasicBlocks.push_back (MBB); } void push_front(MachineBasicBlock *MBB) { BasicBlocks.push_front(MBB); } void insert(iterator MBBI, MachineBasicBlock *MBB) { BasicBlocks.insert(MBBI, MBB); } void splice(iterator InsertPt, iterator MBBI) { BasicBlocks.splice(InsertPt, BasicBlocks, MBBI); } void splice(iterator InsertPt, MachineBasicBlock *MBB) { BasicBlocks.splice(InsertPt, BasicBlocks, MBB); } void splice(iterator InsertPt, iterator MBBI, iterator MBBE) { BasicBlocks.splice(InsertPt, BasicBlocks, MBBI, MBBE); } void remove(iterator MBBI) { BasicBlocks.remove(MBBI); } void remove(MachineBasicBlock *MBBI) { BasicBlocks.remove(MBBI); } void erase(iterator MBBI) { BasicBlocks.erase(MBBI); } void erase(MachineBasicBlock *MBBI) { BasicBlocks.erase(MBBI); } template void sort(Comp comp) { BasicBlocks.sort(comp); } //===--------------------------------------------------------------------===// // Internal functions used to automatically number MachineBasicBlocks // /// \brief Adds the MBB to the internal numbering. Returns the unique number /// assigned to the MBB. /// unsigned addToMBBNumbering(MachineBasicBlock *MBB) { MBBNumbering.push_back(MBB); return (unsigned)MBBNumbering.size()-1; } /// removeFromMBBNumbering - Remove the specific machine basic block from our /// tracker, this is only really to be used by the MachineBasicBlock /// implementation. void removeFromMBBNumbering(unsigned N) { assert(N < MBBNumbering.size() && "Illegal basic block #"); MBBNumbering[N] = nullptr; } /// CreateMachineInstr - Allocate a new MachineInstr. Use this instead /// of `new MachineInstr'. /// MachineInstr *CreateMachineInstr(const MCInstrDesc &MCID, DebugLoc DL, bool NoImp = false); /// CloneMachineInstr - Create a new MachineInstr which is a copy of the /// 'Orig' instruction, identical in all ways except the instruction /// has no parent, prev, or next. /// /// See also TargetInstrInfo::duplicate() for target-specific fixes to cloned /// instructions. MachineInstr *CloneMachineInstr(const MachineInstr *Orig); /// DeleteMachineInstr - Delete the given MachineInstr. /// void DeleteMachineInstr(MachineInstr *MI); /// CreateMachineBasicBlock - Allocate a new MachineBasicBlock. Use this /// instead of `new MachineBasicBlock'. /// MachineBasicBlock *CreateMachineBasicBlock(const BasicBlock *bb = nullptr); /// DeleteMachineBasicBlock - Delete the given MachineBasicBlock. /// void DeleteMachineBasicBlock(MachineBasicBlock *MBB); /// getMachineMemOperand - Allocate a new MachineMemOperand. /// MachineMemOperands are owned by the MachineFunction and need not be /// explicitly deallocated. MachineMemOperand *getMachineMemOperand(MachinePointerInfo PtrInfo, unsigned f, uint64_t s, unsigned base_alignment, const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr); /// getMachineMemOperand - Allocate a new MachineMemOperand by copying /// an existing one, adjusting by an offset and using the given size. /// MachineMemOperands are owned by the MachineFunction and need not be /// explicitly deallocated. MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO, int64_t Offset, uint64_t Size); typedef ArrayRecycler::Capacity OperandCapacity; /// Allocate an array of MachineOperands. This is only intended for use by /// internal MachineInstr functions. MachineOperand *allocateOperandArray(OperandCapacity Cap) { return OperandRecycler.allocate(Cap, Allocator); } /// Dellocate an array of MachineOperands and recycle the memory. This is /// only intended for use by internal MachineInstr functions. /// Cap must be the same capacity that was used to allocate the array. void deallocateOperandArray(OperandCapacity Cap, MachineOperand *Array) { OperandRecycler.deallocate(Cap, Array); } /// \brief Allocate and initialize a register mask with @p NumRegister bits. uint32_t *allocateRegisterMask(unsigned NumRegister) { unsigned Size = (NumRegister + 31) / 32; uint32_t *Mask = Allocator.Allocate(Size); for (unsigned i = 0; i != Size; ++i) Mask[i] = 0; return Mask; } /// allocateMemRefsArray - Allocate an array to hold MachineMemOperand /// pointers. This array is owned by the MachineFunction. MachineInstr::mmo_iterator allocateMemRefsArray(unsigned long Num); /// extractLoadMemRefs - Allocate an array and populate it with just the /// load information from the given MachineMemOperand sequence. std::pair extractLoadMemRefs(MachineInstr::mmo_iterator Begin, MachineInstr::mmo_iterator End); /// extractStoreMemRefs - Allocate an array and populate it with just the /// store information from the given MachineMemOperand sequence. std::pair extractStoreMemRefs(MachineInstr::mmo_iterator Begin, MachineInstr::mmo_iterator End); /// Allocate a string and populate it with the given external symbol name. const char *createExternalSymbolName(StringRef Name); //===--------------------------------------------------------------------===// // Label Manipulation. // /// getJTISymbol - Return the MCSymbol for the specified non-empty jump table. /// If isLinkerPrivate is specified, an 'l' label is returned, otherwise a /// normal 'L' label is returned. MCSymbol *getJTISymbol(unsigned JTI, MCContext &Ctx, bool isLinkerPrivate = false) const; /// getPICBaseSymbol - Return a function-local symbol to represent the PIC /// base. MCSymbol *getPICBaseSymbol() const; }; //===--------------------------------------------------------------------===// // GraphTraits specializations for function basic block graphs (CFGs) //===--------------------------------------------------------------------===// // Provide specializations of GraphTraits to be able to treat a // machine function as a graph of machine basic blocks... these are // the same as the machine basic block iterators, except that the root // node is implicitly the first node of the function. // template <> struct GraphTraits : public GraphTraits { static NodeType *getEntryNode(MachineFunction *F) { return &F->front(); } // nodes_iterator/begin/end - Allow iteration over all nodes in the graph typedef MachineFunction::iterator nodes_iterator; static nodes_iterator nodes_begin(MachineFunction *F) { return F->begin(); } static nodes_iterator nodes_end (MachineFunction *F) { return F->end(); } static unsigned size (MachineFunction *F) { return F->size(); } }; template <> struct GraphTraits : public GraphTraits { static NodeType *getEntryNode(const MachineFunction *F) { return &F->front(); } // nodes_iterator/begin/end - Allow iteration over all nodes in the graph typedef MachineFunction::const_iterator nodes_iterator; static nodes_iterator nodes_begin(const MachineFunction *F) { return F->begin(); } static nodes_iterator nodes_end (const MachineFunction *F) { return F->end(); } static unsigned size (const MachineFunction *F) { return F->size(); } }; // Provide specializations of GraphTraits to be able to treat a function as a // graph of basic blocks... and to walk it in inverse order. Inverse order for // a function is considered to be when traversing the predecessor edges of a BB // instead of the successor edges. // template <> struct GraphTraits > : public GraphTraits > { static NodeType *getEntryNode(Inverse G) { return &G.Graph->front(); } }; template <> struct GraphTraits > : public GraphTraits > { static NodeType *getEntryNode(Inverse G) { return &G.Graph->front(); } }; } // End llvm namespace #endif Index: projects/clang380-import/contrib/llvm/include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- projects/clang380-import/contrib/llvm/include/llvm/CodeGen/SelectionDAGNodes.h (revision 294608) +++ projects/clang380-import/contrib/llvm/include/llvm/CodeGen/SelectionDAGNodes.h (revision 294609) @@ -1,2323 +1,2338 @@ //===-- llvm/CodeGen/SelectionDAGNodes.h - SelectionDAG Nodes ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file declares the SDNode class and derived classes, which are used to // represent the nodes and operations present in a SelectionDAG. These nodes // and operations are machine code level operations, with some similarities to // the GCC RTL representation. // // Clients should include the SelectionDAG.h file instead of this file directly. // //===----------------------------------------------------------------------===// #ifndef LLVM_CODEGEN_SELECTIONDAGNODES_H #define LLVM_CODEGEN_SELECTIONDAGNODES_H #include "llvm/ADT/BitVector.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/ilist_node.h" #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/MathExtras.h" #include namespace llvm { class SelectionDAG; class GlobalValue; class MachineBasicBlock; class MachineConstantPoolValue; class SDNode; class BinaryWithFlagsSDNode; class Value; class MCSymbol; template struct DenseMapInfo; template struct simplify_type; template struct ilist_traits; void checkForCycles(const SDNode *N, const SelectionDAG *DAG = nullptr, bool force = false); /// This represents a list of ValueType's that has been intern'd by /// a SelectionDAG. Instances of this simple value class are returned by /// SelectionDAG::getVTList(...). /// struct SDVTList { const EVT *VTs; unsigned int NumVTs; }; namespace ISD { /// Node predicates /// Return true if the specified node is a /// BUILD_VECTOR where all of the elements are ~0 or undef. bool isBuildVectorAllOnes(const SDNode *N); /// Return true if the specified node is a /// BUILD_VECTOR where all of the elements are 0 or undef. bool isBuildVectorAllZeros(const SDNode *N); /// \brief Return true if the specified node is a BUILD_VECTOR node of /// all ConstantSDNode or undef. bool isBuildVectorOfConstantSDNodes(const SDNode *N); /// \brief Return true if the specified node is a BUILD_VECTOR node of /// all ConstantFPSDNode or undef. bool isBuildVectorOfConstantFPSDNodes(const SDNode *N); /// Return true if the node has at least one operand /// and all operands of the specified node are ISD::UNDEF. bool allOperandsUndef(const SDNode *N); } // end llvm:ISD namespace //===----------------------------------------------------------------------===// /// Unlike LLVM values, Selection DAG nodes may return multiple /// values as the result of a computation. Many nodes return multiple values, /// from loads (which define a token and a return value) to ADDC (which returns /// a result and a carry value), to calls (which may return an arbitrary number /// of values). /// /// As such, each use of a SelectionDAG computation must indicate the node that /// computes it as well as which return value to use from that node. This pair /// of information is represented with the SDValue value type. /// class SDValue { friend struct DenseMapInfo; SDNode *Node; // The node defining the value we are using. unsigned ResNo; // Which return value of the node we are using. public: SDValue() : Node(nullptr), ResNo(0) {} SDValue(SDNode *node, unsigned resno); /// get the index which selects a specific result in the SDNode unsigned getResNo() const { return ResNo; } /// get the SDNode which holds the desired result SDNode *getNode() const { return Node; } /// set the SDNode void setNode(SDNode *N) { Node = N; } inline SDNode *operator->() const { return Node; } bool operator==(const SDValue &O) const { return Node == O.Node && ResNo == O.ResNo; } bool operator!=(const SDValue &O) const { return !operator==(O); } bool operator<(const SDValue &O) const { return std::tie(Node, ResNo) < std::tie(O.Node, O.ResNo); } explicit operator bool() const { return Node != nullptr; } SDValue getValue(unsigned R) const { return SDValue(Node, R); } /// Return true if this node is an operand of N. bool isOperandOf(const SDNode *N) const; /// Return the ValueType of the referenced return value. inline EVT getValueType() const; /// Return the simple ValueType of the referenced return value. MVT getSimpleValueType() const { return getValueType().getSimpleVT(); } /// Returns the size of the value in bits. unsigned getValueSizeInBits() const { return getValueType().getSizeInBits(); } unsigned getScalarValueSizeInBits() const { return getValueType().getScalarType().getSizeInBits(); } // Forwarding methods - These forward to the corresponding methods in SDNode. inline unsigned getOpcode() const; inline unsigned getNumOperands() const; inline const SDValue &getOperand(unsigned i) const; inline uint64_t getConstantOperandVal(unsigned i) const; inline bool isTargetMemoryOpcode() const; inline bool isTargetOpcode() const; inline bool isMachineOpcode() const; inline bool isUndef() const; inline unsigned getMachineOpcode() const; inline const DebugLoc &getDebugLoc() const; inline void dump() const; inline void dumpr() const; /// Return true if this operand (which must be a chain) reaches the /// specified operand without crossing any side-effecting instructions. /// In practice, this looks through token factors and non-volatile loads. /// In order to remain efficient, this only /// looks a couple of nodes in, it does not do an exhaustive search. bool reachesChainWithoutSideEffects(SDValue Dest, unsigned Depth = 2) const; /// Return true if there are no nodes using value ResNo of Node. inline bool use_empty() const; /// Return true if there is exactly one node using value ResNo of Node. inline bool hasOneUse() const; }; template<> struct DenseMapInfo { static inline SDValue getEmptyKey() { SDValue V; V.ResNo = -1U; return V; } static inline SDValue getTombstoneKey() { SDValue V; V.ResNo = -2U; return V; } static unsigned getHashValue(const SDValue &Val) { return ((unsigned)((uintptr_t)Val.getNode() >> 4) ^ (unsigned)((uintptr_t)Val.getNode() >> 9)) + Val.getResNo(); } static bool isEqual(const SDValue &LHS, const SDValue &RHS) { return LHS == RHS; } }; template <> struct isPodLike { static const bool value = true; }; /// Allow casting operators to work directly on /// SDValues as if they were SDNode*'s. template<> struct simplify_type { typedef SDNode* SimpleType; static SimpleType getSimplifiedValue(SDValue &Val) { return Val.getNode(); } }; template<> struct simplify_type { typedef /*const*/ SDNode* SimpleType; static SimpleType getSimplifiedValue(const SDValue &Val) { return Val.getNode(); } }; /// Represents a use of a SDNode. This class holds an SDValue, /// which records the SDNode being used and the result number, a /// pointer to the SDNode using the value, and Next and Prev pointers, /// which link together all the uses of an SDNode. /// class SDUse { /// Val - The value being used. SDValue Val; /// User - The user of this value. SDNode *User; /// Prev, Next - Pointers to the uses list of the SDNode referred by /// this operand. SDUse **Prev, *Next; SDUse(const SDUse &U) = delete; void operator=(const SDUse &U) = delete; public: SDUse() : Val(), User(nullptr), Prev(nullptr), Next(nullptr) {} /// Normally SDUse will just implicitly convert to an SDValue that it holds. operator const SDValue&() const { return Val; } /// If implicit conversion to SDValue doesn't work, the get() method returns /// the SDValue. const SDValue &get() const { return Val; } /// This returns the SDNode that contains this Use. SDNode *getUser() { return User; } /// Get the next SDUse in the use list. SDUse *getNext() const { return Next; } /// Convenience function for get().getNode(). SDNode *getNode() const { return Val.getNode(); } /// Convenience function for get().getResNo(). unsigned getResNo() const { return Val.getResNo(); } /// Convenience function for get().getValueType(). EVT getValueType() const { return Val.getValueType(); } /// Convenience function for get().operator== bool operator==(const SDValue &V) const { return Val == V; } /// Convenience function for get().operator!= bool operator!=(const SDValue &V) const { return Val != V; } /// Convenience function for get().operator< bool operator<(const SDValue &V) const { return Val < V; } private: friend class SelectionDAG; friend class SDNode; void setUser(SDNode *p) { User = p; } /// Remove this use from its existing use list, assign it the /// given value, and add it to the new value's node's use list. inline void set(const SDValue &V); /// Like set, but only supports initializing a newly-allocated /// SDUse with a non-null value. inline void setInitial(const SDValue &V); /// Like set, but only sets the Node portion of the value, /// leaving the ResNo portion unmodified. inline void setNode(SDNode *N); void addToList(SDUse **List) { Next = *List; if (Next) Next->Prev = &Next; Prev = List; *List = this; } void removeFromList() { *Prev = Next; if (Next) Next->Prev = Prev; } }; /// simplify_type specializations - Allow casting operators to work directly on /// SDValues as if they were SDNode*'s. template<> struct simplify_type { typedef SDNode* SimpleType; static SimpleType getSimplifiedValue(SDUse &Val) { return Val.getNode(); } }; /// These are IR-level optimization flags that may be propagated to SDNodes. /// TODO: This data structure should be shared by the IR optimizer and the /// the backend. struct SDNodeFlags { private: bool NoUnsignedWrap : 1; bool NoSignedWrap : 1; bool Exact : 1; bool UnsafeAlgebra : 1; bool NoNaNs : 1; bool NoInfs : 1; bool NoSignedZeros : 1; bool AllowReciprocal : 1; public: /// Default constructor turns off all optimization flags. SDNodeFlags() { NoUnsignedWrap = false; NoSignedWrap = false; Exact = false; UnsafeAlgebra = false; NoNaNs = false; NoInfs = false; NoSignedZeros = false; AllowReciprocal = false; } // These are mutators for each flag. void setNoUnsignedWrap(bool b) { NoUnsignedWrap = b; } void setNoSignedWrap(bool b) { NoSignedWrap = b; } void setExact(bool b) { Exact = b; } void setUnsafeAlgebra(bool b) { UnsafeAlgebra = b; } void setNoNaNs(bool b) { NoNaNs = b; } void setNoInfs(bool b) { NoInfs = b; } void setNoSignedZeros(bool b) { NoSignedZeros = b; } void setAllowReciprocal(bool b) { AllowReciprocal = b; } // These are accessors for each flag. bool hasNoUnsignedWrap() const { return NoUnsignedWrap; } bool hasNoSignedWrap() const { return NoSignedWrap; } bool hasExact() const { return Exact; } bool hasUnsafeAlgebra() const { return UnsafeAlgebra; } bool hasNoNaNs() const { return NoNaNs; } bool hasNoInfs() const { return NoInfs; } bool hasNoSignedZeros() const { return NoSignedZeros; } bool hasAllowReciprocal() const { return AllowReciprocal; } /// Return a raw encoding of the flags. /// This function should only be used to add data to the NodeID value. unsigned getRawFlags() const { return (NoUnsignedWrap << 0) | (NoSignedWrap << 1) | (Exact << 2) | (UnsafeAlgebra << 3) | (NoNaNs << 4) | (NoInfs << 5) | (NoSignedZeros << 6) | (AllowReciprocal << 7); } + + /// Clear any flags in this flag set that aren't also set in Flags. + void intersectWith(const SDNodeFlags *Flags) { + NoUnsignedWrap &= Flags->NoUnsignedWrap; + NoSignedWrap &= Flags->NoSignedWrap; + Exact &= Flags->Exact; + UnsafeAlgebra &= Flags->UnsafeAlgebra; + NoNaNs &= Flags->NoNaNs; + NoInfs &= Flags->NoInfs; + NoSignedZeros &= Flags->NoSignedZeros; + AllowReciprocal &= Flags->AllowReciprocal; + } }; /// Represents one node in the SelectionDAG. /// class SDNode : public FoldingSetNode, public ilist_node { private: /// The operation that this node performs. int16_t NodeType; /// This is true if OperandList was new[]'d. If true, /// then they will be delete[]'d when the node is destroyed. uint16_t OperandsNeedDelete : 1; /// This tracks whether this node has one or more dbg_value /// nodes corresponding to it. uint16_t HasDebugValue : 1; protected: /// This member is defined by this class, but is not used for /// anything. Subclasses can use it to hold whatever state they find useful. /// This field is initialized to zero by the ctor. uint16_t SubclassData : 14; private: /// Unique id per SDNode in the DAG. int NodeId; /// The values that are used by this operation. SDUse *OperandList; /// The types of the values this node defines. SDNode's may /// define multiple values simultaneously. const EVT *ValueList; /// List of uses for this SDNode. SDUse *UseList; /// The number of entries in the Operand/Value list. unsigned short NumOperands, NumValues; // The ordering of the SDNodes. It roughly corresponds to the ordering of the // original LLVM instructions. // This is used for turning off scheduling, because we'll forgo // the normal scheduling algorithms and output the instructions according to // this ordering. unsigned IROrder; /// Source line information. DebugLoc debugLoc; /// Return a pointer to the specified value type. static const EVT *getValueTypeList(EVT VT); friend class SelectionDAG; friend struct ilist_traits; public: /// Unique and persistent id per SDNode in the DAG. /// Used for debug printing. uint16_t PersistentId; //===--------------------------------------------------------------------===// // Accessors // /// Return the SelectionDAG opcode value for this node. For /// pre-isel nodes (those for which isMachineOpcode returns false), these /// are the opcode values in the ISD and ISD namespaces. For /// post-isel opcodes, see getMachineOpcode. unsigned getOpcode() const { return (unsigned short)NodeType; } /// Test if this node has a target-specific opcode (in the /// \ISD namespace). bool isTargetOpcode() const { return NodeType >= ISD::BUILTIN_OP_END; } /// Test if this node has a target-specific /// memory-referencing opcode (in the \ISD namespace and /// greater than FIRST_TARGET_MEMORY_OPCODE). bool isTargetMemoryOpcode() const { return NodeType >= ISD::FIRST_TARGET_MEMORY_OPCODE; } /// Return true if the type of the node type undefined. bool isUndef() const { return NodeType == ISD::UNDEF; } /// Test if this node is a memory intrinsic (with valid pointer information). /// INTRINSIC_W_CHAIN and INTRINSIC_VOID nodes are sometimes created for /// non-memory intrinsics (with chains) that are not really instances of /// MemSDNode. For such nodes, we need some extra state to determine the /// proper classof relationship. bool isMemIntrinsic() const { return (NodeType == ISD::INTRINSIC_W_CHAIN || NodeType == ISD::INTRINSIC_VOID) && ((SubclassData >> 13) & 1); } /// Test if this node has a post-isel opcode, directly /// corresponding to a MachineInstr opcode. bool isMachineOpcode() const { return NodeType < 0; } /// This may only be called if isMachineOpcode returns /// true. It returns the MachineInstr opcode value that the node's opcode /// corresponds to. unsigned getMachineOpcode() const { assert(isMachineOpcode() && "Not a MachineInstr opcode!"); return ~NodeType; } /// Get this bit. bool getHasDebugValue() const { return HasDebugValue; } /// Set this bit. void setHasDebugValue(bool b) { HasDebugValue = b; } /// Return true if there are no uses of this node. bool use_empty() const { return UseList == nullptr; } /// Return true if there is exactly one use of this node. bool hasOneUse() const { return !use_empty() && std::next(use_begin()) == use_end(); } /// Return the number of uses of this node. This method takes /// time proportional to the number of uses. size_t use_size() const { return std::distance(use_begin(), use_end()); } /// Return the unique node id. int getNodeId() const { return NodeId; } /// Set unique node id. void setNodeId(int Id) { NodeId = Id; } /// Return the node ordering. unsigned getIROrder() const { return IROrder; } /// Set the node ordering. void setIROrder(unsigned Order) { IROrder = Order; } /// Return the source location info. const DebugLoc &getDebugLoc() const { return debugLoc; } /// Set source location info. Try to avoid this, putting /// it in the constructor is preferable. void setDebugLoc(DebugLoc dl) { debugLoc = std::move(dl); } /// This class provides iterator support for SDUse /// operands that use a specific SDNode. class use_iterator : public std::iterator { SDUse *Op; explicit use_iterator(SDUse *op) : Op(op) { } friend class SDNode; public: typedef std::iterator::reference reference; typedef std::iterator::pointer pointer; use_iterator(const use_iterator &I) : Op(I.Op) {} use_iterator() : Op(nullptr) {} bool operator==(const use_iterator &x) const { return Op == x.Op; } bool operator!=(const use_iterator &x) const { return !operator==(x); } /// Return true if this iterator is at the end of uses list. bool atEnd() const { return Op == nullptr; } // Iterator traversal: forward iteration only. use_iterator &operator++() { // Preincrement assert(Op && "Cannot increment end iterator!"); Op = Op->getNext(); return *this; } use_iterator operator++(int) { // Postincrement use_iterator tmp = *this; ++*this; return tmp; } /// Retrieve a pointer to the current user node. SDNode *operator*() const { assert(Op && "Cannot dereference end iterator!"); return Op->getUser(); } SDNode *operator->() const { return operator*(); } SDUse &getUse() const { return *Op; } /// Retrieve the operand # of this use in its user. unsigned getOperandNo() const { assert(Op && "Cannot dereference end iterator!"); return (unsigned)(Op - Op->getUser()->OperandList); } }; /// Provide iteration support to walk over all uses of an SDNode. use_iterator use_begin() const { return use_iterator(UseList); } static use_iterator use_end() { return use_iterator(nullptr); } inline iterator_range uses() { return make_range(use_begin(), use_end()); } inline iterator_range uses() const { return make_range(use_begin(), use_end()); } /// Return true if there are exactly NUSES uses of the indicated value. /// This method ignores uses of other values defined by this operation. bool hasNUsesOfValue(unsigned NUses, unsigned Value) const; /// Return true if there are any use of the indicated value. /// This method ignores uses of other values defined by this operation. bool hasAnyUseOfValue(unsigned Value) const; /// Return true if this node is the only use of N. bool isOnlyUserOf(const SDNode *N) const; /// Return true if this node is an operand of N. bool isOperandOf(const SDNode *N) const; /// Return true if this node is a predecessor of N. /// NOTE: Implemented on top of hasPredecessor and every bit as /// expensive. Use carefully. bool isPredecessorOf(const SDNode *N) const { return N->hasPredecessor(this); } /// Return true if N is a predecessor of this node. /// N is either an operand of this node, or can be reached by recursively /// traversing up the operands. /// NOTE: This is an expensive method. Use it carefully. bool hasPredecessor(const SDNode *N) const; /// Return true if N is a predecessor of this node. /// N is either an operand of this node, or can be reached by recursively /// traversing up the operands. /// In this helper the Visited and worklist sets are held externally to /// cache predecessors over multiple invocations. If you want to test for /// multiple predecessors this method is preferable to multiple calls to /// hasPredecessor. Be sure to clear Visited and Worklist if the DAG /// changes. /// NOTE: This is still very expensive. Use carefully. bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl &Visited, SmallVectorImpl &Worklist) const; /// Return the number of values used by this operation. unsigned getNumOperands() const { return NumOperands; } /// Helper method returns the integer value of a ConstantSDNode operand. uint64_t getConstantOperandVal(unsigned Num) const; const SDValue &getOperand(unsigned Num) const { assert(Num < NumOperands && "Invalid child # of SDNode!"); return OperandList[Num]; } typedef SDUse* op_iterator; op_iterator op_begin() const { return OperandList; } op_iterator op_end() const { return OperandList+NumOperands; } ArrayRef ops() const { return makeArrayRef(op_begin(), op_end()); } /// Iterator for directly iterating over the operand SDValue's. struct value_op_iterator : iterator_adaptor_base { explicit value_op_iterator(SDUse *U = nullptr) : iterator_adaptor_base(U) {} const SDValue &operator*() const { return I->get(); } }; iterator_range op_values() const { return make_range(value_op_iterator(op_begin()), value_op_iterator(op_end())); } SDVTList getVTList() const { SDVTList X = { ValueList, NumValues }; return X; } /// If this node has a glue operand, return the node /// to which the glue operand points. Otherwise return NULL. SDNode *getGluedNode() const { if (getNumOperands() != 0 && getOperand(getNumOperands()-1).getValueType() == MVT::Glue) return getOperand(getNumOperands()-1).getNode(); return nullptr; } /// If this node has a glue value with a user, return /// the user (there is at most one). Otherwise return NULL. SDNode *getGluedUser() const { for (use_iterator UI = use_begin(), UE = use_end(); UI != UE; ++UI) if (UI.getUse().get().getValueType() == MVT::Glue) return *UI; return nullptr; } /// This could be defined as a virtual function and implemented more simply /// and directly, but it is not to avoid creating a vtable for this class. const SDNodeFlags *getFlags() const; + + /// Clear any flags in this node that aren't also set in Flags. + void intersectFlagsWith(const SDNodeFlags *Flags); /// Return the number of values defined/returned by this operator. unsigned getNumValues() const { return NumValues; } /// Return the type of a specified result. EVT getValueType(unsigned ResNo) const { assert(ResNo < NumValues && "Illegal result number!"); return ValueList[ResNo]; } /// Return the type of a specified result as a simple type. MVT getSimpleValueType(unsigned ResNo) const { return getValueType(ResNo).getSimpleVT(); } /// Returns MVT::getSizeInBits(getValueType(ResNo)). unsigned getValueSizeInBits(unsigned ResNo) const { return getValueType(ResNo).getSizeInBits(); } typedef const EVT* value_iterator; value_iterator value_begin() const { return ValueList; } value_iterator value_end() const { return ValueList+NumValues; } /// Return the opcode of this operation for printing. std::string getOperationName(const SelectionDAG *G = nullptr) const; static const char* getIndexedModeName(ISD::MemIndexedMode AM); void print_types(raw_ostream &OS, const SelectionDAG *G) const; void print_details(raw_ostream &OS, const SelectionDAG *G) const; void print(raw_ostream &OS, const SelectionDAG *G = nullptr) const; void printr(raw_ostream &OS, const SelectionDAG *G = nullptr) const; /// Print a SelectionDAG node and all children down to /// the leaves. The given SelectionDAG allows target-specific nodes /// to be printed in human-readable form. Unlike printr, this will /// print the whole DAG, including children that appear multiple /// times. /// void printrFull(raw_ostream &O, const SelectionDAG *G = nullptr) const; /// Print a SelectionDAG node and children up to /// depth "depth." The given SelectionDAG allows target-specific /// nodes to be printed in human-readable form. Unlike printr, this /// will print children that appear multiple times wherever they are /// used. /// void printrWithDepth(raw_ostream &O, const SelectionDAG *G = nullptr, unsigned depth = 100) const; /// Dump this node, for debugging. void dump() const; /// Dump (recursively) this node and its use-def subgraph. void dumpr() const; /// Dump this node, for debugging. /// The given SelectionDAG allows target-specific nodes to be printed /// in human-readable form. void dump(const SelectionDAG *G) const; /// Dump (recursively) this node and its use-def subgraph. /// The given SelectionDAG allows target-specific nodes to be printed /// in human-readable form. void dumpr(const SelectionDAG *G) const; /// printrFull to dbgs(). The given SelectionDAG allows /// target-specific nodes to be printed in human-readable form. /// Unlike dumpr, this will print the whole DAG, including children /// that appear multiple times. void dumprFull(const SelectionDAG *G = nullptr) const; /// printrWithDepth to dbgs(). The given /// SelectionDAG allows target-specific nodes to be printed in /// human-readable form. Unlike dumpr, this will print children /// that appear multiple times wherever they are used. /// void dumprWithDepth(const SelectionDAG *G = nullptr, unsigned depth = 100) const; /// Gather unique data for the node. void Profile(FoldingSetNodeID &ID) const; /// This method should only be used by the SDUse class. void addUse(SDUse &U) { U.addToList(&UseList); } protected: static SDVTList getSDVTList(EVT VT) { SDVTList Ret = { getValueTypeList(VT), 1 }; return Ret; } SDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, ArrayRef Ops) : NodeType(Opc), OperandsNeedDelete(true), HasDebugValue(false), SubclassData(0), NodeId(-1), OperandList(Ops.size() ? new SDUse[Ops.size()] : nullptr), ValueList(VTs.VTs), UseList(nullptr), NumOperands(Ops.size()), NumValues(VTs.NumVTs), IROrder(Order), debugLoc(std::move(dl)) { assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); assert(NumOperands == Ops.size() && "NumOperands wasn't wide enough for its operands!"); assert(NumValues == VTs.NumVTs && "NumValues wasn't wide enough for its operands!"); for (unsigned i = 0; i != Ops.size(); ++i) { assert(OperandList && "no operands available"); OperandList[i].setUser(this); OperandList[i].setInitial(Ops[i]); } checkForCycles(this); } /// This constructor adds no operands itself; operands can be /// set later with InitOperands. SDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs) : NodeType(Opc), OperandsNeedDelete(false), HasDebugValue(false), SubclassData(0), NodeId(-1), OperandList(nullptr), ValueList(VTs.VTs), UseList(nullptr), NumOperands(0), NumValues(VTs.NumVTs), IROrder(Order), debugLoc(std::move(dl)) { assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); assert(NumValues == VTs.NumVTs && "NumValues wasn't wide enough for its operands!"); } /// Initialize the operands list of this with 1 operand. void InitOperands(SDUse *Ops, const SDValue &Op0) { Ops[0].setUser(this); Ops[0].setInitial(Op0); NumOperands = 1; OperandList = Ops; checkForCycles(this); } /// Initialize the operands list of this with 2 operands. void InitOperands(SDUse *Ops, const SDValue &Op0, const SDValue &Op1) { Ops[0].setUser(this); Ops[0].setInitial(Op0); Ops[1].setUser(this); Ops[1].setInitial(Op1); NumOperands = 2; OperandList = Ops; checkForCycles(this); } /// Initialize the operands list of this with 3 operands. void InitOperands(SDUse *Ops, const SDValue &Op0, const SDValue &Op1, const SDValue &Op2) { Ops[0].setUser(this); Ops[0].setInitial(Op0); Ops[1].setUser(this); Ops[1].setInitial(Op1); Ops[2].setUser(this); Ops[2].setInitial(Op2); NumOperands = 3; OperandList = Ops; checkForCycles(this); } /// Initialize the operands list of this with 4 operands. void InitOperands(SDUse *Ops, const SDValue &Op0, const SDValue &Op1, const SDValue &Op2, const SDValue &Op3) { Ops[0].setUser(this); Ops[0].setInitial(Op0); Ops[1].setUser(this); Ops[1].setInitial(Op1); Ops[2].setUser(this); Ops[2].setInitial(Op2); Ops[3].setUser(this); Ops[3].setInitial(Op3); NumOperands = 4; OperandList = Ops; checkForCycles(this); } /// Initialize the operands list of this with N operands. void InitOperands(SDUse *Ops, const SDValue *Vals, unsigned N) { for (unsigned i = 0; i != N; ++i) { Ops[i].setUser(this); Ops[i].setInitial(Vals[i]); } NumOperands = N; assert(NumOperands == N && "NumOperands wasn't wide enough for its operands!"); OperandList = Ops; checkForCycles(this); } /// Release the operands and set this node to have zero operands. void DropOperands(); }; /// Wrapper class for IR location info (IR ordering and DebugLoc) to be passed /// into SDNode creation functions. /// When an SDNode is created from the DAGBuilder, the DebugLoc is extracted /// from the original Instruction, and IROrder is the ordinal position of /// the instruction. /// When an SDNode is created after the DAG is being built, both DebugLoc and /// the IROrder are propagated from the original SDNode. /// So SDLoc class provides two constructors besides the default one, one to /// be used by the DAGBuilder, the other to be used by others. class SDLoc { private: // Ptr could be used for either Instruction* or SDNode*. It is used for // Instruction* if IROrder is not -1. const void *Ptr; int IROrder; public: SDLoc() : Ptr(nullptr), IROrder(0) {} SDLoc(const SDNode *N) : Ptr(N), IROrder(-1) { assert(N && "null SDNode"); } SDLoc(const SDValue V) : Ptr(V.getNode()), IROrder(-1) { assert(Ptr && "null SDNode"); } SDLoc(const Instruction *I, int Order) : Ptr(I), IROrder(Order) { assert(Order >= 0 && "bad IROrder"); } unsigned getIROrder() { if (IROrder >= 0 || Ptr == nullptr) { return (unsigned)IROrder; } const SDNode *N = (const SDNode*)(Ptr); return N->getIROrder(); } DebugLoc getDebugLoc() { if (!Ptr) { return DebugLoc(); } if (IROrder >= 0) { const Instruction *I = (const Instruction*)(Ptr); return I->getDebugLoc(); } const SDNode *N = (const SDNode*)(Ptr); return N->getDebugLoc(); } }; // Define inline functions from the SDValue class. inline SDValue::SDValue(SDNode *node, unsigned resno) : Node(node), ResNo(resno) { assert((!Node || ResNo < Node->getNumValues()) && "Invalid result number for the given node!"); assert(ResNo < -2U && "Cannot use result numbers reserved for DenseMaps."); } inline unsigned SDValue::getOpcode() const { return Node->getOpcode(); } inline EVT SDValue::getValueType() const { return Node->getValueType(ResNo); } inline unsigned SDValue::getNumOperands() const { return Node->getNumOperands(); } inline const SDValue &SDValue::getOperand(unsigned i) const { return Node->getOperand(i); } inline uint64_t SDValue::getConstantOperandVal(unsigned i) const { return Node->getConstantOperandVal(i); } inline bool SDValue::isTargetOpcode() const { return Node->isTargetOpcode(); } inline bool SDValue::isTargetMemoryOpcode() const { return Node->isTargetMemoryOpcode(); } inline bool SDValue::isMachineOpcode() const { return Node->isMachineOpcode(); } inline unsigned SDValue::getMachineOpcode() const { return Node->getMachineOpcode(); } inline bool SDValue::isUndef() const { return Node->isUndef(); } inline bool SDValue::use_empty() const { return !Node->hasAnyUseOfValue(ResNo); } inline bool SDValue::hasOneUse() const { return Node->hasNUsesOfValue(1, ResNo); } inline const DebugLoc &SDValue::getDebugLoc() const { return Node->getDebugLoc(); } inline void SDValue::dump() const { return Node->dump(); } inline void SDValue::dumpr() const { return Node->dumpr(); } // Define inline functions from the SDUse class. inline void SDUse::set(const SDValue &V) { if (Val.getNode()) removeFromList(); Val = V; if (V.getNode()) V.getNode()->addUse(*this); } inline void SDUse::setInitial(const SDValue &V) { Val = V; V.getNode()->addUse(*this); } inline void SDUse::setNode(SDNode *N) { if (Val.getNode()) removeFromList(); Val.setNode(N); if (N) N->addUse(*this); } /// This class is used for single-operand SDNodes. This is solely /// to allow co-allocation of node operands with the node itself. class UnarySDNode : public SDNode { SDUse Op; public: UnarySDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, SDValue X) : SDNode(Opc, Order, dl, VTs) { InitOperands(&Op, X); } }; /// This class is used for two-operand SDNodes. This is solely /// to allow co-allocation of node operands with the node itself. class BinarySDNode : public SDNode { SDUse Ops[2]; public: BinarySDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, SDValue X, SDValue Y) : SDNode(Opc, Order, dl, VTs) { InitOperands(Ops, X, Y); } }; /// Returns true if the opcode is a binary operation with flags. static bool isBinOpWithFlags(unsigned Opcode) { switch (Opcode) { case ISD::SDIV: case ISD::UDIV: case ISD::SRA: case ISD::SRL: case ISD::MUL: case ISD::ADD: case ISD::SUB: case ISD::SHL: case ISD::FADD: case ISD::FDIV: case ISD::FMUL: case ISD::FREM: case ISD::FSUB: return true; default: return false; } } /// This class is an extension of BinarySDNode /// used from those opcodes that have associated extra flags. class BinaryWithFlagsSDNode : public BinarySDNode { public: SDNodeFlags Flags; BinaryWithFlagsSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, SDValue X, SDValue Y, const SDNodeFlags &NodeFlags) : BinarySDNode(Opc, Order, dl, VTs, X, Y), Flags(NodeFlags) {} static bool classof(const SDNode *N) { return isBinOpWithFlags(N->getOpcode()); } }; /// This class is used for three-operand SDNodes. This is solely /// to allow co-allocation of node operands with the node itself. class TernarySDNode : public SDNode { SDUse Ops[3]; public: TernarySDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, SDValue X, SDValue Y, SDValue Z) : SDNode(Opc, Order, dl, VTs) { InitOperands(Ops, X, Y, Z); } }; /// This class is used to form a handle around another node that /// is persistent and is updated across invocations of replaceAllUsesWith on its /// operand. This node should be directly created by end-users and not added to /// the AllNodes list. class HandleSDNode : public SDNode { SDUse Op; public: explicit HandleSDNode(SDValue X) : SDNode(ISD::HANDLENODE, 0, DebugLoc(), getSDVTList(MVT::Other)) { // HandleSDNodes are never inserted into the DAG, so they won't be // auto-numbered. Use ID 65535 as a sentinel. PersistentId = 0xffff; InitOperands(&Op, X); } ~HandleSDNode(); const SDValue &getValue() const { return Op; } }; class AddrSpaceCastSDNode : public UnarySDNode { private: unsigned SrcAddrSpace; unsigned DestAddrSpace; public: AddrSpaceCastSDNode(unsigned Order, DebugLoc dl, EVT VT, SDValue X, unsigned SrcAS, unsigned DestAS); unsigned getSrcAddressSpace() const { return SrcAddrSpace; } unsigned getDestAddressSpace() const { return DestAddrSpace; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ADDRSPACECAST; } }; /// This is an abstract virtual class for memory operations. class MemSDNode : public SDNode { private: // VT of in-memory value. EVT MemoryVT; protected: /// Memory reference information. MachineMemOperand *MMO; public: MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, EVT MemoryVT, MachineMemOperand *MMO); MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, ArrayRef Ops, EVT MemoryVT, MachineMemOperand *MMO); bool readMem() const { return MMO->isLoad(); } bool writeMem() const { return MMO->isStore(); } /// Returns alignment and volatility of the memory access unsigned getOriginalAlignment() const { return MMO->getBaseAlignment(); } unsigned getAlignment() const { return MMO->getAlignment(); } /// Return the SubclassData value, which contains an /// encoding of the volatile flag, as well as bits used by subclasses. This /// function should only be used to compute a FoldingSetNodeID value. unsigned getRawSubclassData() const { return SubclassData; } // We access subclass data here so that we can check consistency // with MachineMemOperand information. bool isVolatile() const { return (SubclassData >> 5) & 1; } bool isNonTemporal() const { return (SubclassData >> 6) & 1; } bool isInvariant() const { return (SubclassData >> 7) & 1; } AtomicOrdering getOrdering() const { return AtomicOrdering((SubclassData >> 8) & 15); } SynchronizationScope getSynchScope() const { return SynchronizationScope((SubclassData >> 12) & 1); } // Returns the offset from the location of the access. int64_t getSrcValueOffset() const { return MMO->getOffset(); } /// Returns the AA info that describes the dereference. AAMDNodes getAAInfo() const { return MMO->getAAInfo(); } /// Returns the Ranges that describes the dereference. const MDNode *getRanges() const { return MMO->getRanges(); } /// Return the type of the in-memory value. EVT getMemoryVT() const { return MemoryVT; } /// Return a MachineMemOperand object describing the memory /// reference performed by operation. MachineMemOperand *getMemOperand() const { return MMO; } const MachinePointerInfo &getPointerInfo() const { return MMO->getPointerInfo(); } /// Return the address space for the associated pointer unsigned getAddressSpace() const { return getPointerInfo().getAddrSpace(); } /// Update this MemSDNode's MachineMemOperand information /// to reflect the alignment of NewMMO, if it has a greater alignment. /// This must only be used when the new alignment applies to all users of /// this MachineMemOperand. void refineAlignment(const MachineMemOperand *NewMMO) { MMO->refineAlignment(NewMMO); } const SDValue &getChain() const { return getOperand(0); } const SDValue &getBasePtr() const { return getOperand(getOpcode() == ISD::STORE ? 2 : 1); } // Methods to support isa and dyn_cast static bool classof(const SDNode *N) { // For some targets, we lower some target intrinsics to a MemIntrinsicNode // with either an intrinsic or a target opcode. return N->getOpcode() == ISD::LOAD || N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::PREFETCH || N->getOpcode() == ISD::ATOMIC_CMP_SWAP || N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS || N->getOpcode() == ISD::ATOMIC_SWAP || N->getOpcode() == ISD::ATOMIC_LOAD_ADD || N->getOpcode() == ISD::ATOMIC_LOAD_SUB || N->getOpcode() == ISD::ATOMIC_LOAD_AND || N->getOpcode() == ISD::ATOMIC_LOAD_OR || N->getOpcode() == ISD::ATOMIC_LOAD_XOR || N->getOpcode() == ISD::ATOMIC_LOAD_NAND || N->getOpcode() == ISD::ATOMIC_LOAD_MIN || N->getOpcode() == ISD::ATOMIC_LOAD_MAX || N->getOpcode() == ISD::ATOMIC_LOAD_UMIN || N->getOpcode() == ISD::ATOMIC_LOAD_UMAX || N->getOpcode() == ISD::ATOMIC_LOAD || N->getOpcode() == ISD::ATOMIC_STORE || N->getOpcode() == ISD::MLOAD || N->getOpcode() == ISD::MSTORE || N->getOpcode() == ISD::MGATHER || N->getOpcode() == ISD::MSCATTER || N->isMemIntrinsic() || N->isTargetMemoryOpcode(); } }; /// This is an SDNode representing atomic operations. class AtomicSDNode : public MemSDNode { SDUse Ops[4]; /// For cmpxchg instructions, the ordering requirements when a store does not /// occur. AtomicOrdering FailureOrdering; void InitAtomic(AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) { // This must match encodeMemSDNodeFlags() in SelectionDAG.cpp. assert((SuccessOrdering & 15) == SuccessOrdering && "Ordering may not require more than 4 bits!"); assert((FailureOrdering & 15) == FailureOrdering && "Ordering may not require more than 4 bits!"); assert((SynchScope & 1) == SynchScope && "SynchScope may not require more than 1 bit!"); SubclassData |= SuccessOrdering << 8; SubclassData |= SynchScope << 12; this->FailureOrdering = FailureOrdering; assert(getSuccessOrdering() == SuccessOrdering && "Ordering encoding error!"); assert(getFailureOrdering() == FailureOrdering && "Ordering encoding error!"); assert(getSynchScope() == SynchScope && "Synch-scope encoding error!"); } public: // Opc: opcode for atomic // VTL: value type list // Chain: memory chain for operaand // Ptr: address to update as a SDValue // Cmp: compare value // Swp: swap value // SrcVal: address to update as a Value (used for MemOperand) // Align: alignment of memory AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope) : MemSDNode(Opc, Order, dl, VTL, MemVT, MMO) { InitAtomic(Ordering, Ordering, SynchScope); InitOperands(Ops, Chain, Ptr, Cmp, Swp); } AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope) : MemSDNode(Opc, Order, dl, VTL, MemVT, MMO) { InitAtomic(Ordering, Ordering, SynchScope); InitOperands(Ops, Chain, Ptr, Val); } AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL, EVT MemVT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope) : MemSDNode(Opc, Order, dl, VTL, MemVT, MMO) { InitAtomic(Ordering, Ordering, SynchScope); InitOperands(Ops, Chain, Ptr); } AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL, EVT MemVT, const SDValue* AllOps, SDUse *DynOps, unsigned NumOps, MachineMemOperand *MMO, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) : MemSDNode(Opc, Order, dl, VTL, MemVT, MMO) { InitAtomic(SuccessOrdering, FailureOrdering, SynchScope); assert((DynOps || NumOps <= array_lengthof(Ops)) && "Too many ops for internal storage!"); InitOperands(DynOps ? DynOps : Ops, AllOps, NumOps); } const SDValue &getBasePtr() const { return getOperand(1); } const SDValue &getVal() const { return getOperand(2); } AtomicOrdering getSuccessOrdering() const { return getOrdering(); } // Not quite enough room in SubclassData for everything, so failure gets its // own field. AtomicOrdering getFailureOrdering() const { return FailureOrdering; } bool isCompareAndSwap() const { unsigned Op = getOpcode(); return Op == ISD::ATOMIC_CMP_SWAP || Op == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS; } // Methods to support isa and dyn_cast static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ATOMIC_CMP_SWAP || N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS || N->getOpcode() == ISD::ATOMIC_SWAP || N->getOpcode() == ISD::ATOMIC_LOAD_ADD || N->getOpcode() == ISD::ATOMIC_LOAD_SUB || N->getOpcode() == ISD::ATOMIC_LOAD_AND || N->getOpcode() == ISD::ATOMIC_LOAD_OR || N->getOpcode() == ISD::ATOMIC_LOAD_XOR || N->getOpcode() == ISD::ATOMIC_LOAD_NAND || N->getOpcode() == ISD::ATOMIC_LOAD_MIN || N->getOpcode() == ISD::ATOMIC_LOAD_MAX || N->getOpcode() == ISD::ATOMIC_LOAD_UMIN || N->getOpcode() == ISD::ATOMIC_LOAD_UMAX || N->getOpcode() == ISD::ATOMIC_LOAD || N->getOpcode() == ISD::ATOMIC_STORE; } }; /// This SDNode is used for target intrinsics that touch /// memory and need an associated MachineMemOperand. Its opcode may be /// INTRINSIC_VOID, INTRINSIC_W_CHAIN, PREFETCH, or a target-specific opcode /// with a value not less than FIRST_TARGET_MEMORY_OPCODE. class MemIntrinsicSDNode : public MemSDNode { public: MemIntrinsicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, ArrayRef Ops, EVT MemoryVT, MachineMemOperand *MMO) : MemSDNode(Opc, Order, dl, VTs, Ops, MemoryVT, MMO) { SubclassData |= 1u << 13; } // Methods to support isa and dyn_cast static bool classof(const SDNode *N) { // We lower some target intrinsics to their target opcode // early a node with a target opcode can be of this class return N->isMemIntrinsic() || N->getOpcode() == ISD::PREFETCH || N->isTargetMemoryOpcode(); } }; /// This SDNode is used to implement the code generator /// support for the llvm IR shufflevector instruction. It combines elements /// from two input vectors into a new input vector, with the selection and /// ordering of elements determined by an array of integers, referred to as /// the shuffle mask. For input vectors of width N, mask indices of 0..N-1 /// refer to elements from the LHS input, and indices from N to 2N-1 the RHS. /// An index of -1 is treated as undef, such that the code generator may put /// any value in the corresponding element of the result. class ShuffleVectorSDNode : public SDNode { SDUse Ops[2]; // The memory for Mask is owned by the SelectionDAG's OperandAllocator, and // is freed when the SelectionDAG object is destroyed. const int *Mask; protected: friend class SelectionDAG; ShuffleVectorSDNode(EVT VT, unsigned Order, DebugLoc dl, SDValue N1, SDValue N2, const int *M) : SDNode(ISD::VECTOR_SHUFFLE, Order, dl, getSDVTList(VT)), Mask(M) { InitOperands(Ops, N1, N2); } public: ArrayRef getMask() const { EVT VT = getValueType(0); return makeArrayRef(Mask, VT.getVectorNumElements()); } int getMaskElt(unsigned Idx) const { assert(Idx < getValueType(0).getVectorNumElements() && "Idx out of range!"); return Mask[Idx]; } bool isSplat() const { return isSplatMask(Mask, getValueType(0)); } int getSplatIndex() const { assert(isSplat() && "Cannot get splat index for non-splat!"); EVT VT = getValueType(0); for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { if (Mask[i] >= 0) return Mask[i]; } llvm_unreachable("Splat with all undef indices?"); } static bool isSplatMask(const int *Mask, EVT VT); /// Change values in a shuffle permute mask assuming /// the two vector operands have swapped position. static void commuteMask(SmallVectorImpl &Mask) { unsigned NumElems = Mask.size(); for (unsigned i = 0; i != NumElems; ++i) { int idx = Mask[i]; if (idx < 0) continue; else if (idx < (int)NumElems) Mask[i] = idx + NumElems; else Mask[i] = idx - NumElems; } } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::VECTOR_SHUFFLE; } }; class ConstantSDNode : public SDNode { const ConstantInt *Value; friend class SelectionDAG; ConstantSDNode(bool isTarget, bool isOpaque, const ConstantInt *val, DebugLoc DL, EVT VT) : SDNode(isTarget ? ISD::TargetConstant : ISD::Constant, 0, DL, getSDVTList(VT)), Value(val) { SubclassData |= (uint16_t)isOpaque; } public: const ConstantInt *getConstantIntValue() const { return Value; } const APInt &getAPIntValue() const { return Value->getValue(); } uint64_t getZExtValue() const { return Value->getZExtValue(); } int64_t getSExtValue() const { return Value->getSExtValue(); } bool isOne() const { return Value->isOne(); } bool isNullValue() const { return Value->isNullValue(); } bool isAllOnesValue() const { return Value->isAllOnesValue(); } bool isOpaque() const { return SubclassData & 1; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::Constant || N->getOpcode() == ISD::TargetConstant; } }; class ConstantFPSDNode : public SDNode { const ConstantFP *Value; friend class SelectionDAG; ConstantFPSDNode(bool isTarget, const ConstantFP *val, DebugLoc DL, EVT VT) : SDNode(isTarget ? ISD::TargetConstantFP : ISD::ConstantFP, 0, DL, getSDVTList(VT)), Value(val) { } public: const APFloat& getValueAPF() const { return Value->getValueAPF(); } const ConstantFP *getConstantFPValue() const { return Value; } /// Return true if the value is positive or negative zero. bool isZero() const { return Value->isZero(); } /// Return true if the value is a NaN. bool isNaN() const { return Value->isNaN(); } /// Return true if the value is an infinity bool isInfinity() const { return Value->isInfinity(); } /// Return true if the value is negative. bool isNegative() const { return Value->isNegative(); } /// We don't rely on operator== working on double values, as /// it returns true for things that are clearly not equal, like -0.0 and 0.0. /// As such, this method can be used to do an exact bit-for-bit comparison of /// two floating point values. /// We leave the version with the double argument here because it's just so /// convenient to write "2.0" and the like. Without this function we'd /// have to duplicate its logic everywhere it's called. bool isExactlyValue(double V) const { bool ignored; APFloat Tmp(V); Tmp.convert(Value->getValueAPF().getSemantics(), APFloat::rmNearestTiesToEven, &ignored); return isExactlyValue(Tmp); } bool isExactlyValue(const APFloat& V) const; static bool isValueValidForType(EVT VT, const APFloat& Val); static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ConstantFP || N->getOpcode() == ISD::TargetConstantFP; } }; /// Returns true if \p V is a constant integer zero. bool isNullConstant(SDValue V); /// Returns true if \p V is an FP constant with a value of positive zero. bool isNullFPConstant(SDValue V); /// Returns true if \p V is an integer constant with all bits set. bool isAllOnesConstant(SDValue V); /// Returns true if \p V is a constant integer one. bool isOneConstant(SDValue V); class GlobalAddressSDNode : public SDNode { const GlobalValue *TheGlobal; int64_t Offset; unsigned char TargetFlags; friend class SelectionDAG; GlobalAddressSDNode(unsigned Opc, unsigned Order, DebugLoc DL, const GlobalValue *GA, EVT VT, int64_t o, unsigned char TargetFlags); public: const GlobalValue *getGlobal() const { return TheGlobal; } int64_t getOffset() const { return Offset; } unsigned char getTargetFlags() const { return TargetFlags; } // Return the address space this GlobalAddress belongs to. unsigned getAddressSpace() const; static bool classof(const SDNode *N) { return N->getOpcode() == ISD::GlobalAddress || N->getOpcode() == ISD::TargetGlobalAddress || N->getOpcode() == ISD::GlobalTLSAddress || N->getOpcode() == ISD::TargetGlobalTLSAddress; } }; class FrameIndexSDNode : public SDNode { int FI; friend class SelectionDAG; FrameIndexSDNode(int fi, EVT VT, bool isTarg) : SDNode(isTarg ? ISD::TargetFrameIndex : ISD::FrameIndex, 0, DebugLoc(), getSDVTList(VT)), FI(fi) { } public: int getIndex() const { return FI; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::FrameIndex || N->getOpcode() == ISD::TargetFrameIndex; } }; class JumpTableSDNode : public SDNode { int JTI; unsigned char TargetFlags; friend class SelectionDAG; JumpTableSDNode(int jti, EVT VT, bool isTarg, unsigned char TF) : SDNode(isTarg ? ISD::TargetJumpTable : ISD::JumpTable, 0, DebugLoc(), getSDVTList(VT)), JTI(jti), TargetFlags(TF) { } public: int getIndex() const { return JTI; } unsigned char getTargetFlags() const { return TargetFlags; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::JumpTable || N->getOpcode() == ISD::TargetJumpTable; } }; class ConstantPoolSDNode : public SDNode { union { const Constant *ConstVal; MachineConstantPoolValue *MachineCPVal; } Val; int Offset; // It's a MachineConstantPoolValue if top bit is set. unsigned Alignment; // Minimum alignment requirement of CP (not log2 value). unsigned char TargetFlags; friend class SelectionDAG; ConstantPoolSDNode(bool isTarget, const Constant *c, EVT VT, int o, unsigned Align, unsigned char TF) : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0, DebugLoc(), getSDVTList(VT)), Offset(o), Alignment(Align), TargetFlags(TF) { assert(Offset >= 0 && "Offset is too large"); Val.ConstVal = c; } ConstantPoolSDNode(bool isTarget, MachineConstantPoolValue *v, EVT VT, int o, unsigned Align, unsigned char TF) : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0, DebugLoc(), getSDVTList(VT)), Offset(o), Alignment(Align), TargetFlags(TF) { assert(Offset >= 0 && "Offset is too large"); Val.MachineCPVal = v; Offset |= 1 << (sizeof(unsigned)*CHAR_BIT-1); } public: bool isMachineConstantPoolEntry() const { return Offset < 0; } const Constant *getConstVal() const { assert(!isMachineConstantPoolEntry() && "Wrong constantpool type"); return Val.ConstVal; } MachineConstantPoolValue *getMachineCPVal() const { assert(isMachineConstantPoolEntry() && "Wrong constantpool type"); return Val.MachineCPVal; } int getOffset() const { return Offset & ~(1 << (sizeof(unsigned)*CHAR_BIT-1)); } // Return the alignment of this constant pool object, which is either 0 (for // default alignment) or the desired value. unsigned getAlignment() const { return Alignment; } unsigned char getTargetFlags() const { return TargetFlags; } Type *getType() const; static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ConstantPool || N->getOpcode() == ISD::TargetConstantPool; } }; /// Completely target-dependent object reference. class TargetIndexSDNode : public SDNode { unsigned char TargetFlags; int Index; int64_t Offset; friend class SelectionDAG; public: TargetIndexSDNode(int Idx, EVT VT, int64_t Ofs, unsigned char TF) : SDNode(ISD::TargetIndex, 0, DebugLoc(), getSDVTList(VT)), TargetFlags(TF), Index(Idx), Offset(Ofs) {} public: unsigned char getTargetFlags() const { return TargetFlags; } int getIndex() const { return Index; } int64_t getOffset() const { return Offset; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::TargetIndex; } }; class BasicBlockSDNode : public SDNode { MachineBasicBlock *MBB; friend class SelectionDAG; /// Debug info is meaningful and potentially useful here, but we create /// blocks out of order when they're jumped to, which makes it a bit /// harder. Let's see if we need it first. explicit BasicBlockSDNode(MachineBasicBlock *mbb) : SDNode(ISD::BasicBlock, 0, DebugLoc(), getSDVTList(MVT::Other)), MBB(mbb) {} public: MachineBasicBlock *getBasicBlock() const { return MBB; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::BasicBlock; } }; /// A "pseudo-class" with methods for operating on BUILD_VECTORs. class BuildVectorSDNode : public SDNode { // These are constructed as SDNodes and then cast to BuildVectorSDNodes. explicit BuildVectorSDNode() = delete; public: /// Check if this is a constant splat, and if so, find the /// smallest element size that splats the vector. If MinSplatBits is /// nonzero, the element size must be at least that large. Note that the /// splat element may be the entire vector (i.e., a one element vector). /// Returns the splat element value in SplatValue. Any undefined bits in /// that value are zero, and the corresponding bits in the SplatUndef mask /// are set. The SplatBitSize value is set to the splat element size in /// bits. HasAnyUndefs is set to true if any bits in the vector are /// undefined. isBigEndian describes the endianness of the target. bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits = 0, bool isBigEndian = false) const; /// \brief Returns the splatted value or a null value if this is not a splat. /// /// If passed a non-null UndefElements bitvector, it will resize it to match /// the vector width and set the bits where elements are undef. SDValue getSplatValue(BitVector *UndefElements = nullptr) const; /// \brief Returns the splatted constant or null if this is not a constant /// splat. /// /// If passed a non-null UndefElements bitvector, it will resize it to match /// the vector width and set the bits where elements are undef. ConstantSDNode * getConstantSplatNode(BitVector *UndefElements = nullptr) const; /// \brief Returns the splatted constant FP or null if this is not a constant /// FP splat. /// /// If passed a non-null UndefElements bitvector, it will resize it to match /// the vector width and set the bits where elements are undef. ConstantFPSDNode * getConstantFPSplatNode(BitVector *UndefElements = nullptr) const; /// \brief If this is a constant FP splat and the splatted constant FP is an /// exact power or 2, return the log base 2 integer value. Otherwise, /// return -1. /// /// The BitWidth specifies the necessary bit precision. int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const; bool isConstant() const; static inline bool classof(const SDNode *N) { return N->getOpcode() == ISD::BUILD_VECTOR; } }; /// An SDNode that holds an arbitrary LLVM IR Value. This is /// used when the SelectionDAG needs to make a simple reference to something /// in the LLVM IR representation. /// class SrcValueSDNode : public SDNode { const Value *V; friend class SelectionDAG; /// Create a SrcValue for a general value. explicit SrcValueSDNode(const Value *v) : SDNode(ISD::SRCVALUE, 0, DebugLoc(), getSDVTList(MVT::Other)), V(v) {} public: /// Return the contained Value. const Value *getValue() const { return V; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::SRCVALUE; } }; class MDNodeSDNode : public SDNode { const MDNode *MD; friend class SelectionDAG; explicit MDNodeSDNode(const MDNode *md) : SDNode(ISD::MDNODE_SDNODE, 0, DebugLoc(), getSDVTList(MVT::Other)), MD(md) {} public: const MDNode *getMD() const { return MD; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MDNODE_SDNODE; } }; class RegisterSDNode : public SDNode { unsigned Reg; friend class SelectionDAG; RegisterSDNode(unsigned reg, EVT VT) : SDNode(ISD::Register, 0, DebugLoc(), getSDVTList(VT)), Reg(reg) { } public: unsigned getReg() const { return Reg; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::Register; } }; class RegisterMaskSDNode : public SDNode { // The memory for RegMask is not owned by the node. const uint32_t *RegMask; friend class SelectionDAG; RegisterMaskSDNode(const uint32_t *mask) : SDNode(ISD::RegisterMask, 0, DebugLoc(), getSDVTList(MVT::Untyped)), RegMask(mask) {} public: const uint32_t *getRegMask() const { return RegMask; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::RegisterMask; } }; class BlockAddressSDNode : public SDNode { const BlockAddress *BA; int64_t Offset; unsigned char TargetFlags; friend class SelectionDAG; BlockAddressSDNode(unsigned NodeTy, EVT VT, const BlockAddress *ba, int64_t o, unsigned char Flags) : SDNode(NodeTy, 0, DebugLoc(), getSDVTList(VT)), BA(ba), Offset(o), TargetFlags(Flags) { } public: const BlockAddress *getBlockAddress() const { return BA; } int64_t getOffset() const { return Offset; } unsigned char getTargetFlags() const { return TargetFlags; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::BlockAddress || N->getOpcode() == ISD::TargetBlockAddress; } }; class EHLabelSDNode : public SDNode { SDUse Chain; MCSymbol *Label; friend class SelectionDAG; EHLabelSDNode(unsigned Order, DebugLoc dl, SDValue ch, MCSymbol *L) : SDNode(ISD::EH_LABEL, Order, dl, getSDVTList(MVT::Other)), Label(L) { InitOperands(&Chain, ch); } public: MCSymbol *getLabel() const { return Label; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::EH_LABEL; } }; class ExternalSymbolSDNode : public SDNode { const char *Symbol; unsigned char TargetFlags; friend class SelectionDAG; ExternalSymbolSDNode(bool isTarget, const char *Sym, unsigned char TF, EVT VT) : SDNode(isTarget ? ISD::TargetExternalSymbol : ISD::ExternalSymbol, 0, DebugLoc(), getSDVTList(VT)), Symbol(Sym), TargetFlags(TF) { } public: const char *getSymbol() const { return Symbol; } unsigned char getTargetFlags() const { return TargetFlags; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ExternalSymbol || N->getOpcode() == ISD::TargetExternalSymbol; } }; class MCSymbolSDNode : public SDNode { MCSymbol *Symbol; friend class SelectionDAG; MCSymbolSDNode(MCSymbol *Symbol, EVT VT) : SDNode(ISD::MCSymbol, 0, DebugLoc(), getSDVTList(VT)), Symbol(Symbol) {} public: MCSymbol *getMCSymbol() const { return Symbol; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MCSymbol; } }; class CondCodeSDNode : public SDNode { ISD::CondCode Condition; friend class SelectionDAG; explicit CondCodeSDNode(ISD::CondCode Cond) : SDNode(ISD::CONDCODE, 0, DebugLoc(), getSDVTList(MVT::Other)), Condition(Cond) { } public: ISD::CondCode get() const { return Condition; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::CONDCODE; } }; /// NOTE: avoid using this node as this may disappear in the /// future and most targets don't support it. class CvtRndSatSDNode : public SDNode { ISD::CvtCode CvtCode; friend class SelectionDAG; explicit CvtRndSatSDNode(EVT VT, unsigned Order, DebugLoc dl, ArrayRef Ops, ISD::CvtCode Code) : SDNode(ISD::CONVERT_RNDSAT, Order, dl, getSDVTList(VT), Ops), CvtCode(Code) { assert(Ops.size() == 5 && "wrong number of operations"); } public: ISD::CvtCode getCvtCode() const { return CvtCode; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::CONVERT_RNDSAT; } }; /// This class is used to represent EVT's, which are used /// to parameterize some operations. class VTSDNode : public SDNode { EVT ValueType; friend class SelectionDAG; explicit VTSDNode(EVT VT) : SDNode(ISD::VALUETYPE, 0, DebugLoc(), getSDVTList(MVT::Other)), ValueType(VT) { } public: EVT getVT() const { return ValueType; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::VALUETYPE; } }; /// Base class for LoadSDNode and StoreSDNode class LSBaseSDNode : public MemSDNode { //! Operand array for load and store /*! \note Moving this array to the base class captures more common functionality shared between LoadSDNode and StoreSDNode */ SDUse Ops[4]; public: LSBaseSDNode(ISD::NodeType NodeTy, unsigned Order, DebugLoc dl, SDValue *Operands, unsigned numOperands, SDVTList VTs, ISD::MemIndexedMode AM, EVT MemVT, MachineMemOperand *MMO) : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) { SubclassData |= AM << 2; assert(getAddressingMode() == AM && "MemIndexedMode encoding error!"); InitOperands(Ops, Operands, numOperands); assert((getOffset().getOpcode() == ISD::UNDEF || isIndexed()) && "Only indexed loads and stores have a non-undef offset operand"); } const SDValue &getOffset() const { return getOperand(getOpcode() == ISD::LOAD ? 2 : 3); } /// Return the addressing mode for this load or store: /// unindexed, pre-inc, pre-dec, post-inc, or post-dec. ISD::MemIndexedMode getAddressingMode() const { return ISD::MemIndexedMode((SubclassData >> 2) & 7); } /// Return true if this is a pre/post inc/dec load/store. bool isIndexed() const { return getAddressingMode() != ISD::UNINDEXED; } /// Return true if this is NOT a pre/post inc/dec load/store. bool isUnindexed() const { return getAddressingMode() == ISD::UNINDEXED; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::LOAD || N->getOpcode() == ISD::STORE; } }; /// This class is used to represent ISD::LOAD nodes. class LoadSDNode : public LSBaseSDNode { friend class SelectionDAG; LoadSDNode(SDValue *ChainPtrOff, unsigned Order, DebugLoc dl, SDVTList VTs, ISD::MemIndexedMode AM, ISD::LoadExtType ETy, EVT MemVT, MachineMemOperand *MMO) : LSBaseSDNode(ISD::LOAD, Order, dl, ChainPtrOff, 3, VTs, AM, MemVT, MMO) { SubclassData |= (unsigned short)ETy; assert(getExtensionType() == ETy && "LoadExtType encoding error!"); assert(readMem() && "Load MachineMemOperand is not a load!"); assert(!writeMem() && "Load MachineMemOperand is a store!"); } public: /// Return whether this is a plain node, /// or one of the varieties of value-extending loads. ISD::LoadExtType getExtensionType() const { return ISD::LoadExtType(SubclassData & 3); } const SDValue &getBasePtr() const { return getOperand(1); } const SDValue &getOffset() const { return getOperand(2); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::LOAD; } }; /// This class is used to represent ISD::STORE nodes. class StoreSDNode : public LSBaseSDNode { friend class SelectionDAG; StoreSDNode(SDValue *ChainValuePtrOff, unsigned Order, DebugLoc dl, SDVTList VTs, ISD::MemIndexedMode AM, bool isTrunc, EVT MemVT, MachineMemOperand *MMO) : LSBaseSDNode(ISD::STORE, Order, dl, ChainValuePtrOff, 4, VTs, AM, MemVT, MMO) { SubclassData |= (unsigned short)isTrunc; assert(isTruncatingStore() == isTrunc && "isTrunc encoding error!"); assert(!readMem() && "Store MachineMemOperand is a load!"); assert(writeMem() && "Store MachineMemOperand is not a store!"); } public: /// Return true if the op does a truncation before store. /// For integers this is the same as doing a TRUNCATE and storing the result. /// For floats, it is the same as doing an FP_ROUND and storing the result. bool isTruncatingStore() const { return SubclassData & 1; } const SDValue &getValue() const { return getOperand(1); } const SDValue &getBasePtr() const { return getOperand(2); } const SDValue &getOffset() const { return getOperand(3); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::STORE; } }; /// This base class is used to represent MLOAD and MSTORE nodes class MaskedLoadStoreSDNode : public MemSDNode { // Operands SDUse Ops[4]; public: friend class SelectionDAG; MaskedLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order, DebugLoc dl, SDValue *Operands, unsigned numOperands, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) { InitOperands(Ops, Operands, numOperands); } // In the both nodes address is Op1, mask is Op2: // MaskedLoadSDNode (Chain, ptr, mask, src0), src0 is a passthru value // MaskedStoreSDNode (Chain, ptr, mask, data) // Mask is a vector of i1 elements const SDValue &getBasePtr() const { return getOperand(1); } const SDValue &getMask() const { return getOperand(2); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MLOAD || N->getOpcode() == ISD::MSTORE; } }; /// This class is used to represent an MLOAD node class MaskedLoadSDNode : public MaskedLoadStoreSDNode { public: friend class SelectionDAG; MaskedLoadSDNode(unsigned Order, DebugLoc dl, SDValue *Operands, unsigned numOperands, SDVTList VTs, ISD::LoadExtType ETy, EVT MemVT, MachineMemOperand *MMO) : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, Operands, numOperands, VTs, MemVT, MMO) { SubclassData |= (unsigned short)ETy; } ISD::LoadExtType getExtensionType() const { return ISD::LoadExtType(SubclassData & 3); } const SDValue &getSrc0() const { return getOperand(3); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MLOAD; } }; /// This class is used to represent an MSTORE node class MaskedStoreSDNode : public MaskedLoadStoreSDNode { public: friend class SelectionDAG; MaskedStoreSDNode(unsigned Order, DebugLoc dl, SDValue *Operands, unsigned numOperands, SDVTList VTs, bool isTrunc, EVT MemVT, MachineMemOperand *MMO) : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, Operands, numOperands, VTs, MemVT, MMO) { SubclassData |= (unsigned short)isTrunc; } /// Return true if the op does a truncation before store. /// For integers this is the same as doing a TRUNCATE and storing the result. /// For floats, it is the same as doing an FP_ROUND and storing the result. bool isTruncatingStore() const { return SubclassData & 1; } const SDValue &getValue() const { return getOperand(3); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MSTORE; } }; /// This is a base class used to represent /// MGATHER and MSCATTER nodes /// class MaskedGatherScatterSDNode : public MemSDNode { // Operands SDUse Ops[5]; public: friend class SelectionDAG; MaskedGatherScatterSDNode(ISD::NodeType NodeTy, unsigned Order, DebugLoc dl, ArrayRef Operands, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) { assert(Operands.size() == 5 && "Incompatible number of operands"); InitOperands(Ops, Operands.data(), Operands.size()); } // In the both nodes address is Op1, mask is Op2: // MaskedGatherSDNode (Chain, src0, mask, base, index), src0 is a passthru value // MaskedScatterSDNode (Chain, value, mask, base, index) // Mask is a vector of i1 elements const SDValue &getBasePtr() const { return getOperand(3); } const SDValue &getIndex() const { return getOperand(4); } const SDValue &getMask() const { return getOperand(2); } const SDValue &getValue() const { return getOperand(1); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MGATHER || N->getOpcode() == ISD::MSCATTER; } }; /// This class is used to represent an MGATHER node /// class MaskedGatherSDNode : public MaskedGatherScatterSDNode { public: friend class SelectionDAG; MaskedGatherSDNode(unsigned Order, DebugLoc dl, ArrayRef Operands, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) : MaskedGatherScatterSDNode(ISD::MGATHER, Order, dl, Operands, VTs, MemVT, MMO) { assert(getValue().getValueType() == getValueType(0) && "Incompatible type of the PassThru value in MaskedGatherSDNode"); assert(getMask().getValueType().getVectorNumElements() == getValueType(0).getVectorNumElements() && "Vector width mismatch between mask and data"); assert(getIndex().getValueType().getVectorNumElements() == getValueType(0).getVectorNumElements() && "Vector width mismatch between index and data"); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MGATHER; } }; /// This class is used to represent an MSCATTER node /// class MaskedScatterSDNode : public MaskedGatherScatterSDNode { public: friend class SelectionDAG; MaskedScatterSDNode(unsigned Order, DebugLoc dl,ArrayRef Operands, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, Operands, VTs, MemVT, MMO) { assert(getMask().getValueType().getVectorNumElements() == getValue().getValueType().getVectorNumElements() && "Vector width mismatch between mask and data"); assert(getIndex().getValueType().getVectorNumElements() == getValue().getValueType().getVectorNumElements() && "Vector width mismatch between index and data"); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MSCATTER; } }; /// An SDNode that represents everything that will be needed /// to construct a MachineInstr. These nodes are created during the /// instruction selection proper phase. class MachineSDNode : public SDNode { public: typedef MachineMemOperand **mmo_iterator; private: friend class SelectionDAG; MachineSDNode(unsigned Opc, unsigned Order, const DebugLoc DL, SDVTList VTs) : SDNode(Opc, Order, DL, VTs), MemRefs(nullptr), MemRefsEnd(nullptr) {} /// Operands for this instruction, if they fit here. If /// they don't, this field is unused. SDUse LocalOperands[4]; /// Memory reference descriptions for this instruction. mmo_iterator MemRefs; mmo_iterator MemRefsEnd; public: mmo_iterator memoperands_begin() const { return MemRefs; } mmo_iterator memoperands_end() const { return MemRefsEnd; } bool memoperands_empty() const { return MemRefsEnd == MemRefs; } /// Assign this MachineSDNodes's memory reference descriptor /// list. This does not transfer ownership. void setMemRefs(mmo_iterator NewMemRefs, mmo_iterator NewMemRefsEnd) { for (mmo_iterator MMI = NewMemRefs, MME = NewMemRefsEnd; MMI != MME; ++MMI) assert(*MMI && "Null mem ref detected!"); MemRefs = NewMemRefs; MemRefsEnd = NewMemRefsEnd; } static bool classof(const SDNode *N) { return N->isMachineOpcode(); } }; class SDNodeIterator : public std::iterator { const SDNode *Node; unsigned Operand; SDNodeIterator(const SDNode *N, unsigned Op) : Node(N), Operand(Op) {} public: bool operator==(const SDNodeIterator& x) const { return Operand == x.Operand; } bool operator!=(const SDNodeIterator& x) const { return !operator==(x); } pointer operator*() const { return Node->getOperand(Operand).getNode(); } pointer operator->() const { return operator*(); } SDNodeIterator& operator++() { // Preincrement ++Operand; return *this; } SDNodeIterator operator++(int) { // Postincrement SDNodeIterator tmp = *this; ++*this; return tmp; } size_t operator-(SDNodeIterator Other) const { assert(Node == Other.Node && "Cannot compare iterators of two different nodes!"); return Operand - Other.Operand; } static SDNodeIterator begin(const SDNode *N) { return SDNodeIterator(N, 0); } static SDNodeIterator end (const SDNode *N) { return SDNodeIterator(N, N->getNumOperands()); } unsigned getOperand() const { return Operand; } const SDNode *getNode() const { return Node; } }; template <> struct GraphTraits { typedef SDNode NodeType; typedef SDNodeIterator ChildIteratorType; static inline NodeType *getEntryNode(SDNode *N) { return N; } static inline ChildIteratorType child_begin(NodeType *N) { return SDNodeIterator::begin(N); } static inline ChildIteratorType child_end(NodeType *N) { return SDNodeIterator::end(N); } }; /// The largest SDNode class. typedef MaskedGatherScatterSDNode LargestSDNode; /// The SDNode class with the greatest alignment requirement. typedef GlobalAddressSDNode MostAlignedSDNode; namespace ISD { /// Returns true if the specified node is a non-extending and unindexed load. inline bool isNormalLoad(const SDNode *N) { const LoadSDNode *Ld = dyn_cast(N); return Ld && Ld->getExtensionType() == ISD::NON_EXTLOAD && Ld->getAddressingMode() == ISD::UNINDEXED; } /// Returns true if the specified node is a non-extending load. inline bool isNON_EXTLoad(const SDNode *N) { return isa(N) && cast(N)->getExtensionType() == ISD::NON_EXTLOAD; } /// Returns true if the specified node is a EXTLOAD. inline bool isEXTLoad(const SDNode *N) { return isa(N) && cast(N)->getExtensionType() == ISD::EXTLOAD; } /// Returns true if the specified node is a SEXTLOAD. inline bool isSEXTLoad(const SDNode *N) { return isa(N) && cast(N)->getExtensionType() == ISD::SEXTLOAD; } /// Returns true if the specified node is a ZEXTLOAD. inline bool isZEXTLoad(const SDNode *N) { return isa(N) && cast(N)->getExtensionType() == ISD::ZEXTLOAD; } /// Returns true if the specified node is an unindexed load. inline bool isUNINDEXEDLoad(const SDNode *N) { return isa(N) && cast(N)->getAddressingMode() == ISD::UNINDEXED; } /// Returns true if the specified node is a non-truncating /// and unindexed store. inline bool isNormalStore(const SDNode *N) { const StoreSDNode *St = dyn_cast(N); return St && !St->isTruncatingStore() && St->getAddressingMode() == ISD::UNINDEXED; } /// Returns true if the specified node is a non-truncating store. inline bool isNON_TRUNCStore(const SDNode *N) { return isa(N) && !cast(N)->isTruncatingStore(); } /// Returns true if the specified node is a truncating store. inline bool isTRUNCStore(const SDNode *N) { return isa(N) && cast(N)->isTruncatingStore(); } /// Returns true if the specified node is an unindexed store. inline bool isUNINDEXEDStore(const SDNode *N) { return isa(N) && cast(N)->getAddressingMode() == ISD::UNINDEXED; } } } // end llvm namespace #endif Index: projects/clang380-import/contrib/llvm/include/llvm/Transforms/Utils/Local.h =================================================================== --- projects/clang380-import/contrib/llvm/include/llvm/Transforms/Utils/Local.h (revision 294608) +++ projects/clang380-import/contrib/llvm/include/llvm/Transforms/Utils/Local.h (revision 294609) @@ -1,336 +1,355 @@ //===-- Local.h - Functions to perform local transformations ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This family of functions perform various local transformations to the // program. // //===----------------------------------------------------------------------===// #ifndef LLVM_TRANSFORMS_UTILS_LOCAL_H #define LLVM_TRANSFORMS_UTILS_LOCAL_H #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Operator.h" namespace llvm { class User; class BasicBlock; class Function; class BranchInst; class Instruction; class DbgDeclareInst; class StoreInst; class LoadInst; class Value; class PHINode; class AllocaInst; class AssumptionCache; class ConstantExpr; class DataLayout; class TargetLibraryInfo; class TargetTransformInfo; class DIBuilder; class DominatorTree; class LazyValueInfo; template class SmallVectorImpl; //===----------------------------------------------------------------------===// // Local constant propagation. // /// ConstantFoldTerminator - If a terminator instruction is predicated on a /// constant value, convert it into an unconditional branch to the constant /// destination. This is a nontrivial operation because the successors of this /// basic block must have their PHI nodes updated. /// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch /// conditions and indirectbr addresses this might make dead if /// DeleteDeadConditions is true. bool ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions = false, const TargetLibraryInfo *TLI = nullptr); //===----------------------------------------------------------------------===// // Local dead code elimination. // /// isInstructionTriviallyDead - Return true if the result produced by the /// instruction is not used, and the instruction has no side effects. /// bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI = nullptr); /// RecursivelyDeleteTriviallyDeadInstructions - If the specified value is a /// trivially dead instruction, delete it. If that makes any of its operands /// trivially dead, delete them too, recursively. Return true if any /// instructions were deleted. bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI = nullptr); /// RecursivelyDeleteDeadPHINode - If the specified value is an effectively /// dead PHI node, due to being a def-use chain of single-use nodes that /// either forms a cycle or is terminated by a trivially dead instruction, /// delete it. If that makes any of its operands trivially dead, delete them /// too, recursively. Return true if a change was made. bool RecursivelyDeleteDeadPHINode(PHINode *PN, const TargetLibraryInfo *TLI = nullptr); /// SimplifyInstructionsInBlock - Scan the specified basic block and try to /// simplify any instructions in it and recursively delete dead instructions. /// /// This returns true if it changed the code, note that it can delete /// instructions in other blocks as well in this block. bool SimplifyInstructionsInBlock(BasicBlock *BB, const TargetLibraryInfo *TLI = nullptr); //===----------------------------------------------------------------------===// // Control Flow Graph Restructuring. // /// RemovePredecessorAndSimplify - Like BasicBlock::removePredecessor, this /// method is called when we're about to delete Pred as a predecessor of BB. If /// BB contains any PHI nodes, this drops the entries in the PHI nodes for Pred. /// /// Unlike the removePredecessor method, this attempts to simplify uses of PHI /// nodes that collapse into identity values. For example, if we have: /// x = phi(1, 0, 0, 0) /// y = and x, z /// /// .. and delete the predecessor corresponding to the '1', this will attempt to /// recursively fold the 'and' to 0. void RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred); /// MergeBasicBlockIntoOnlyPred - BB is a block with one predecessor and its /// predecessor is known to have one successor (BB!). Eliminate the edge /// between them, moving the instructions in the predecessor into BB. This /// deletes the predecessor block. /// void MergeBasicBlockIntoOnlyPred(BasicBlock *BB, DominatorTree *DT = nullptr); /// TryToSimplifyUncondBranchFromEmptyBlock - BB is known to contain an /// unconditional branch, and contains no instructions other than PHI nodes, /// potential debug intrinsics and the branch. If possible, eliminate BB by /// rewriting all the predecessors to branch to the successor block and return /// true. If we can't transform, return false. bool TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB); /// EliminateDuplicatePHINodes - Check for and eliminate duplicate PHI /// nodes in this block. This doesn't try to be clever about PHI nodes /// which differ only in the order of the incoming values, but instcombine /// orders them so it usually won't matter. /// bool EliminateDuplicatePHINodes(BasicBlock *BB); /// SimplifyCFG - This function is used to do simplification of a CFG. For /// example, it adjusts branches to branches to eliminate the extra hop, it /// eliminates unreachable basic blocks, and does other "peephole" optimization /// of the CFG. It returns true if a modification was made, possibly deleting /// the basic block that was pointed to. /// bool SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI, unsigned BonusInstThreshold, AssumptionCache *AC = nullptr); /// FlatternCFG - This function is used to flatten a CFG. For /// example, it uses parallel-and and parallel-or mode to collapse // if-conditions and merge if-regions with identical statements. /// bool FlattenCFG(BasicBlock *BB, AliasAnalysis *AA = nullptr); /// FoldBranchToCommonDest - If this basic block is ONLY a setcc and a branch, /// and if a predecessor branches to us and one of our successors, fold the /// setcc into the predecessor and use logical operations to pick the right /// destination. bool FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold = 1); /// DemoteRegToStack - This function takes a virtual register computed by an /// Instruction and replaces it with a slot in the stack frame, allocated via /// alloca. This allows the CFG to be changed around without fear of /// invalidating the SSA information for the value. It returns the pointer to /// the alloca inserted to create a stack slot for X. /// AllocaInst *DemoteRegToStack(Instruction &X, bool VolatileLoads = false, Instruction *AllocaPoint = nullptr); /// DemotePHIToStack - This function takes a virtual register computed by a phi /// node and replaces it with a slot in the stack frame, allocated via alloca. /// The phi node is deleted and it returns the pointer to the alloca inserted. AllocaInst *DemotePHIToStack(PHINode *P, Instruction *AllocaPoint = nullptr); /// getOrEnforceKnownAlignment - If the specified pointer has an alignment that /// we can determine, return it, otherwise return 0. If PrefAlign is specified, /// and it is more than the alignment of the ultimate object, see if we can /// increase the alignment of the ultimate object, making this check succeed. unsigned getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, const DataLayout &DL, const Instruction *CxtI = nullptr, AssumptionCache *AC = nullptr, const DominatorTree *DT = nullptr); /// getKnownAlignment - Try to infer an alignment for the specified pointer. static inline unsigned getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI = nullptr, AssumptionCache *AC = nullptr, const DominatorTree *DT = nullptr) { return getOrEnforceKnownAlignment(V, 0, DL, CxtI, AC, DT); } /// EmitGEPOffset - Given a getelementptr instruction/constantexpr, emit the /// code necessary to compute the offset from the base pointer (without adding /// in the base pointer). Return the result as a signed integer of intptr size. /// When NoAssumptions is true, no assumptions about index computation not /// overflowing is made. template Value *EmitGEPOffset(IRBuilderTy *Builder, const DataLayout &DL, User *GEP, bool NoAssumptions = false) { GEPOperator *GEPOp = cast(GEP); Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); Value *Result = Constant::getNullValue(IntPtrTy); // If the GEP is inbounds, we know that none of the addressing operations will // overflow in an unsigned sense. bool isInBounds = GEPOp->isInBounds() && !NoAssumptions; // Build a mask for high order bits. unsigned IntPtrWidth = IntPtrTy->getScalarType()->getIntegerBitWidth(); uint64_t PtrSizeMask = ~0ULL >> (64 - IntPtrWidth); gep_type_iterator GTI = gep_type_begin(GEP); for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); i != e; ++i, ++GTI) { Value *Op = *i; uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()) & PtrSizeMask; if (Constant *OpC = dyn_cast(Op)) { if (OpC->isZeroValue()) continue; // Handle a struct index, which adds its field offset to the pointer. if (StructType *STy = dyn_cast(*GTI)) { if (OpC->getType()->isVectorTy()) OpC = OpC->getSplatValue(); uint64_t OpValue = cast(OpC)->getZExtValue(); Size = DL.getStructLayout(STy)->getElementOffset(OpValue); if (Size) Result = Builder->CreateAdd(Result, ConstantInt::get(IntPtrTy, Size), GEP->getName()+".offs"); continue; } Constant *Scale = ConstantInt::get(IntPtrTy, Size); Constant *OC = ConstantExpr::getIntegerCast(OpC, IntPtrTy, true /*SExt*/); Scale = ConstantExpr::getMul(OC, Scale, isInBounds/*NUW*/); // Emit an add instruction. Result = Builder->CreateAdd(Result, Scale, GEP->getName()+".offs"); continue; } // Convert to correct type. if (Op->getType() != IntPtrTy) Op = Builder->CreateIntCast(Op, IntPtrTy, true, Op->getName()+".c"); if (Size != 1) { // We'll let instcombine(mul) convert this to a shl if possible. Op = Builder->CreateMul(Op, ConstantInt::get(IntPtrTy, Size), GEP->getName()+".idx", isInBounds /*NUW*/); } // Emit an add instruction. Result = Builder->CreateAdd(Op, Result, GEP->getName()+".offs"); } return Result; } ///===---------------------------------------------------------------------===// /// Dbg Intrinsic utilities /// /// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value /// that has an associated llvm.dbg.decl intrinsic. bool ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, StoreInst *SI, DIBuilder &Builder); /// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value /// that has an associated llvm.dbg.decl intrinsic. bool ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, LoadInst *LI, DIBuilder &Builder); /// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into appropriate set /// of llvm.dbg.value intrinsics. bool LowerDbgDeclare(Function &F); /// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic corresponding to /// an alloca, if any. DbgDeclareInst *FindAllocaDbgDeclare(Value *V); /// \brief Replaces llvm.dbg.declare instruction when the address it describes /// is replaced with a new value. If Deref is true, an additional DW_OP_deref is /// prepended to the expression. If Offset is non-zero, a constant displacement /// is added to the expression (after the optional Deref). Offset can be /// negative. bool replaceDbgDeclare(Value *Address, Value *NewAddress, Instruction *InsertBefore, DIBuilder &Builder, bool Deref, int Offset); /// \brief Replaces llvm.dbg.declare instruction when the alloca it describes /// is replaced with a new value. If Deref is true, an additional DW_OP_deref is /// prepended to the expression. If Offset is non-zero, a constant displacement /// is added to the expression (after the optional Deref). Offset can be /// negative. New llvm.dbg.declare is inserted immediately before AI. bool replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, DIBuilder &Builder, bool Deref, int Offset = 0); /// \brief Insert an unreachable instruction before the specified /// instruction, making it and the rest of the code in the block dead. void changeToUnreachable(Instruction *I, bool UseLLVMTrap); /// Replace 'BB's terminator with one that does not have an unwind successor /// block. Rewrites `invoke` to `call`, etc. Updates any PHIs in unwind /// successor. /// /// \param BB Block whose terminator will be replaced. Its terminator must /// have an unwind successor. void removeUnwindEdge(BasicBlock *BB); /// \brief Remove all blocks that can not be reached from the function's entry. /// /// Returns true if any basic block was removed. bool removeUnreachableBlocks(Function &F, LazyValueInfo *LVI = nullptr); /// \brief Combine the metadata of two instructions so that K can replace J /// /// Metadata not listed as known via KnownIDs is removed void combineMetadata(Instruction *K, const Instruction *J, ArrayRef KnownIDs); /// \brief Replace each use of 'From' with 'To' if that use is dominated by /// the given edge. Returns the number of replacements made. unsigned replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, const BasicBlockEdge &Edge); /// \brief Replace each use of 'From' with 'To' if that use is dominated by /// the given BasicBlock. Returns the number of replacements made. unsigned replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, const BasicBlock *BB); /// \brief Return true if the CallSite CS calls a gc leaf function. /// /// A leaf function is a function that does not safepoint the thread during its /// execution. During a call or invoke to such a function, the callers stack /// does not have to be made parseable. /// /// Most passes can and should ignore this information, and it is only used /// during lowering by the GC infrastructure. bool callsGCLeafFunction(ImmutableCallSite CS); +//===----------------------------------------------------------------------===// +// Intrinsic pattern matching +// + +/// Try and match a bitreverse or bswap idiom. +/// +/// If an idiom is matched, an intrinsic call is inserted before \c I. Any added +/// instructions are returned in \c InsertedInsts. They will all have been added +/// to a basic block. +/// +/// A bitreverse idiom normally requires around 2*BW nodes to be searched (where +/// BW is the bitwidth of the integer type). A bswap idiom requires anywhere up +/// to BW / 4 nodes to be searched, so is significantly faster. +/// +/// This function returns true on a successful match or false otherwise. +bool recognizeBitReverseOrBSwapIdiom( + Instruction *I, bool MatchBSwaps, bool MatchBitReversals, + SmallVectorImpl &InsertedInsts); + } // End llvm namespace #endif Index: projects/clang380-import/contrib/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h =================================================================== --- projects/clang380-import/contrib/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h (revision 294608) +++ projects/clang380-import/contrib/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h (revision 294609) @@ -1,174 +1,172 @@ //===- SimplifyLibCalls.h - Library call simplifier -------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file exposes an interface to build some C language libcalls for // optimization passes that need to call the various functions. // //===----------------------------------------------------------------------===// #ifndef LLVM_TRANSFORMS_UTILS_SIMPLIFYLIBCALLS_H #define LLVM_TRANSFORMS_UTILS_SIMPLIFYLIBCALLS_H #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/IRBuilder.h" namespace llvm { class Value; class CallInst; class DataLayout; class Instruction; class TargetLibraryInfo; class BasicBlock; class Function; /// \brief This class implements simplifications for calls to fortified library /// functions (__st*cpy_chk, __memcpy_chk, __memmove_chk, __memset_chk), to, /// when possible, replace them with their non-checking counterparts. /// Other optimizations can also be done, but it's possible to disable them and /// only simplify needless use of the checking versions (when the object size /// is unknown) by passing true for OnlyLowerUnknownSize. class FortifiedLibCallSimplifier { private: const TargetLibraryInfo *TLI; bool OnlyLowerUnknownSize; public: FortifiedLibCallSimplifier(const TargetLibraryInfo *TLI, bool OnlyLowerUnknownSize = false); /// \brief Take the given call instruction and return a more /// optimal value to replace the instruction with or 0 if a more /// optimal form can't be found. /// The call must not be an indirect call. Value *optimizeCall(CallInst *CI); private: Value *optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B); Value *optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B); Value *optimizeMemSetChk(CallInst *CI, IRBuilder<> &B); // Str/Stp cpy are similar enough to be handled in the same functions. Value *optimizeStrpCpyChk(CallInst *CI, IRBuilder<> &B, LibFunc::Func Func); Value *optimizeStrpNCpyChk(CallInst *CI, IRBuilder<> &B, LibFunc::Func Func); /// \brief Checks whether the call \p CI to a fortified libcall is foldable /// to the non-fortified version. bool isFortifiedCallFoldable(CallInst *CI, unsigned ObjSizeOp, unsigned SizeOp, bool isString); }; /// LibCallSimplifier - This class implements a collection of optimizations /// that replace well formed calls to library functions with a more optimal /// form. For example, replacing 'printf("Hello!")' with 'puts("Hello!")'. class LibCallSimplifier { private: FortifiedLibCallSimplifier FortifiedSimplifier; const DataLayout &DL; const TargetLibraryInfo *TLI; bool UnsafeFPShrink; function_ref Replacer; /// \brief Internal wrapper for RAUW that is the default implementation. /// /// Other users may provide an alternate function with this signature instead /// of this one. static void replaceAllUsesWithDefault(Instruction *I, Value *With); /// \brief Replace an instruction's uses with a value using our replacer. void replaceAllUsesWith(Instruction *I, Value *With); public: LibCallSimplifier(const DataLayout &DL, const TargetLibraryInfo *TLI, function_ref Replacer = &replaceAllUsesWithDefault); /// optimizeCall - Take the given call instruction and return a more /// optimal value to replace the instruction with or 0 if a more /// optimal form can't be found. Note that the returned value may /// be equal to the instruction being optimized. In this case all /// other instructions that use the given instruction were modified /// and the given instruction is dead. /// The call must not be an indirect call. Value *optimizeCall(CallInst *CI); private: // String and Memory Library Call Optimizations Value *optimizeStrCat(CallInst *CI, IRBuilder<> &B); Value *optimizeStrNCat(CallInst *CI, IRBuilder<> &B); Value *optimizeStrChr(CallInst *CI, IRBuilder<> &B); Value *optimizeStrRChr(CallInst *CI, IRBuilder<> &B); Value *optimizeStrCmp(CallInst *CI, IRBuilder<> &B); Value *optimizeStrNCmp(CallInst *CI, IRBuilder<> &B); Value *optimizeStrCpy(CallInst *CI, IRBuilder<> &B); Value *optimizeStpCpy(CallInst *CI, IRBuilder<> &B); Value *optimizeStrNCpy(CallInst *CI, IRBuilder<> &B); Value *optimizeStrLen(CallInst *CI, IRBuilder<> &B); Value *optimizeStrPBrk(CallInst *CI, IRBuilder<> &B); Value *optimizeStrTo(CallInst *CI, IRBuilder<> &B); Value *optimizeStrSpn(CallInst *CI, IRBuilder<> &B); Value *optimizeStrCSpn(CallInst *CI, IRBuilder<> &B); Value *optimizeStrStr(CallInst *CI, IRBuilder<> &B); Value *optimizeMemChr(CallInst *CI, IRBuilder<> &B); Value *optimizeMemCmp(CallInst *CI, IRBuilder<> &B); Value *optimizeMemCpy(CallInst *CI, IRBuilder<> &B); Value *optimizeMemMove(CallInst *CI, IRBuilder<> &B); Value *optimizeMemSet(CallInst *CI, IRBuilder<> &B); // Wrapper for all String/Memory Library Call Optimizations Value *optimizeStringMemoryLibCall(CallInst *CI, IRBuilder<> &B); // Math Library Optimizations - Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, bool CheckRetType); - Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B); Value *optimizeCos(CallInst *CI, IRBuilder<> &B); Value *optimizePow(CallInst *CI, IRBuilder<> &B); Value *optimizeExp2(CallInst *CI, IRBuilder<> &B); Value *optimizeFabs(CallInst *CI, IRBuilder<> &B); Value *optimizeFMinFMax(CallInst *CI, IRBuilder<> &B); Value *optimizeLog(CallInst *CI, IRBuilder<> &B); Value *optimizeSqrt(CallInst *CI, IRBuilder<> &B); Value *optimizeSinCosPi(CallInst *CI, IRBuilder<> &B); Value *optimizeTan(CallInst *CI, IRBuilder<> &B); // Integer Library Call Optimizations Value *optimizeFFS(CallInst *CI, IRBuilder<> &B); Value *optimizeAbs(CallInst *CI, IRBuilder<> &B); Value *optimizeIsDigit(CallInst *CI, IRBuilder<> &B); Value *optimizeIsAscii(CallInst *CI, IRBuilder<> &B); Value *optimizeToAscii(CallInst *CI, IRBuilder<> &B); // Formatting and IO Library Call Optimizations Value *optimizeErrorReporting(CallInst *CI, IRBuilder<> &B, int StreamArg = -1); Value *optimizePrintF(CallInst *CI, IRBuilder<> &B); Value *optimizeSPrintF(CallInst *CI, IRBuilder<> &B); Value *optimizeFPrintF(CallInst *CI, IRBuilder<> &B); Value *optimizeFWrite(CallInst *CI, IRBuilder<> &B); Value *optimizeFPuts(CallInst *CI, IRBuilder<> &B); Value *optimizePuts(CallInst *CI, IRBuilder<> &B); // Helper methods Value *emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B); void classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat, SmallVectorImpl &SinCalls, SmallVectorImpl &CosCalls, SmallVectorImpl &SinCosCalls); void replaceTrigInsts(SmallVectorImpl &Calls, Value *Res); Value *optimizePrintFString(CallInst *CI, IRBuilder<> &B); Value *optimizeSPrintFString(CallInst *CI, IRBuilder<> &B); Value *optimizeFPrintFString(CallInst *CI, IRBuilder<> &B); /// hasFloatVersion - Checks if there is a float version of the specified /// function by checking for an existing function with name FuncName + f bool hasFloatVersion(StringRef FuncName); }; } // End llvm namespace #endif Index: projects/clang380-import/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- projects/clang380-import/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp (revision 294609) @@ -1,5572 +1,5601 @@ //===- CodeGenPrepare.cpp - Prepare a function for code generation --------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This pass munges the code in the input function to better prepare it for // SelectionDAG-based code generation. This works around limitations in it's // basic-block-at-a-time approach. It should eventually be removed. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Statepoint.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/ValueMap.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "codegenprepare" STATISTIC(NumBlocksElim, "Number of blocks eliminated"); STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated"); STATISTIC(NumGEPsElim, "Number of GEPs converted to casts"); STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of " "sunken Cmps"); STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses " "of sunken Casts"); STATISTIC(NumMemoryInsts, "Number of memory instructions whose address " "computations were sunk"); STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); STATISTIC(NumAndsAdded, "Number of and mask instructions added to form ext loads"); STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized"); STATISTIC(NumRetsDup, "Number of return instructions duplicated"); STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches"); STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); static cl::opt DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), cl::desc("Disable branch optimizations in CodeGenPrepare")); static cl::opt DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false), cl::desc("Disable GC optimizations in CodeGenPrepare")); static cl::opt DisableSelectToBranch( "disable-cgp-select2branch", cl::Hidden, cl::init(false), cl::desc("Disable select to branch conversion.")); static cl::opt AddrSinkUsingGEPs( "addr-sink-using-gep", cl::Hidden, cl::init(false), cl::desc("Address sinking in CGP using GEPs.")); static cl::opt EnableAndCmpSinking( "enable-andcmp-sinking", cl::Hidden, cl::init(true), cl::desc("Enable sinkinig and/cmp into branches.")); static cl::opt DisableStoreExtract( "disable-cgp-store-extract", cl::Hidden, cl::init(false), cl::desc("Disable store(extract) optimizations in CodeGenPrepare")); static cl::opt StressStoreExtract( "stress-cgp-store-extract", cl::Hidden, cl::init(false), cl::desc("Stress test store(extract) optimizations in CodeGenPrepare")); static cl::opt DisableExtLdPromotion( "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false), cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in " "CodeGenPrepare")); static cl::opt StressExtLdPromotion( "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false), cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) " "optimization in CodeGenPrepare")); namespace { typedef SmallPtrSet SetOfInstrs; typedef PointerIntPair TypeIsSExt; typedef DenseMap InstrToOrigTy; class TypePromotionTransaction; class CodeGenPrepare : public FunctionPass { const TargetMachine *TM; const TargetLowering *TLI; const TargetTransformInfo *TTI; const TargetLibraryInfo *TLInfo; /// As we scan instructions optimizing them, this is the next instruction /// to optimize. Transforms that can invalidate this should update it. BasicBlock::iterator CurInstIterator; /// Keeps track of non-local addresses that have been sunk into a block. /// This allows us to avoid inserting duplicate code for blocks with /// multiple load/stores of the same address. ValueMap SunkAddrs; /// Keeps track of all instructions inserted for the current function. SetOfInstrs InsertedInsts; /// Keeps track of the type of the related instruction before their /// promotion for the current function. InstrToOrigTy PromotedInsts; /// True if CFG is modified in any way. bool ModifiedDT; /// True if optimizing for size. bool OptSize; /// DataLayout for the Function being processed. const DataLayout *DL; public: static char ID; // Pass identification, replacement for typeid explicit CodeGenPrepare(const TargetMachine *TM = nullptr) : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr) { initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; const char *getPassName() const override { return "CodeGen Prepare"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved(); AU.addRequired(); AU.addRequired(); } private: bool eliminateFallThrough(Function &F); bool eliminateMostlyEmptyBlocks(Function &F); bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; void eliminateMostlyEmptyBlock(BasicBlock *BB); bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT); bool optimizeInst(Instruction *I, bool& ModifiedDT); bool optimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy, unsigned AS); bool optimizeInlineAsmInst(CallInst *CS); bool optimizeCallInst(CallInst *CI, bool& ModifiedDT); bool moveExtToFormExtLoad(Instruction *&I); bool optimizeExtUses(Instruction *I); bool optimizeLoadExt(LoadInst *I); bool optimizeSelectInst(SelectInst *SI); bool optimizeShuffleVectorInst(ShuffleVectorInst *SI); bool optimizeSwitchInst(SwitchInst *CI); bool optimizeExtractElementInst(Instruction *Inst); bool dupRetToEnableTailCallOpts(BasicBlock *BB); bool placeDbgValues(Function &F); bool sinkAndCmp(Function &F); bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI, Instruction *&Inst, const SmallVectorImpl &Exts, unsigned CreatedInstCost); bool splitBranchCondition(Function &F); bool simplifyOffsetableRelocate(Instruction &I); void stripInvariantGroupMetadata(Instruction &I); }; } char CodeGenPrepare::ID = 0; INITIALIZE_TM_PASS(CodeGenPrepare, "codegenprepare", "Optimize for code generation", false, false) FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) { return new CodeGenPrepare(TM); } bool CodeGenPrepare::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; DL = &F.getParent()->getDataLayout(); bool EverMadeChange = false; // Clear per function information. InsertedInsts.clear(); PromotedInsts.clear(); ModifiedDT = false; if (TM) TLI = TM->getSubtargetImpl(F)->getTargetLowering(); TLInfo = &getAnalysis().getTLI(); TTI = &getAnalysis().getTTI(F); OptSize = F.optForSize(); /// This optimization identifies DIV instructions that can be /// profitably bypassed and carried out with a shorter, faster divide. if (!OptSize && TLI && TLI->isSlowDivBypassed()) { const DenseMap &BypassWidths = TLI->getBypassSlowDivWidths(); BasicBlock* BB = &*F.begin(); while (BB != nullptr) { // bypassSlowDivision may create new BBs, but we don't want to reapply the // optimization to those blocks. BasicBlock* Next = BB->getNextNode(); EverMadeChange |= bypassSlowDivision(BB, BypassWidths); BB = Next; } } // Eliminate blocks that contain only PHI nodes and an // unconditional branch. EverMadeChange |= eliminateMostlyEmptyBlocks(F); // llvm.dbg.value is far away from the value then iSel may not be able // handle it properly. iSel will drop llvm.dbg.value if it can not // find a node corresponding to the value. EverMadeChange |= placeDbgValues(F); // If there is a mask, compare against zero, and branch that can be combined // into a single target instruction, push the mask and compare into branch // users. Do this before OptimizeBlock -> OptimizeInst -> // OptimizeCmpExpression, which perturbs the pattern being searched for. if (!DisableBranchOpts) { EverMadeChange |= sinkAndCmp(F); EverMadeChange |= splitBranchCondition(F); } bool MadeChange = true; while (MadeChange) { MadeChange = false; for (Function::iterator I = F.begin(); I != F.end(); ) { BasicBlock *BB = &*I++; bool ModifiedDTOnIteration = false; MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration); // Restart BB iteration if the dominator tree of the Function was changed if (ModifiedDTOnIteration) break; } EverMadeChange |= MadeChange; } SunkAddrs.clear(); if (!DisableBranchOpts) { MadeChange = false; SmallPtrSet WorkList; for (BasicBlock &BB : F) { SmallVector Successors(succ_begin(&BB), succ_end(&BB)); MadeChange |= ConstantFoldTerminator(&BB, true); if (!MadeChange) continue; for (SmallVectorImpl::iterator II = Successors.begin(), IE = Successors.end(); II != IE; ++II) if (pred_begin(*II) == pred_end(*II)) WorkList.insert(*II); } // Delete the dead blocks and any of their dead successors. MadeChange |= !WorkList.empty(); while (!WorkList.empty()) { BasicBlock *BB = *WorkList.begin(); WorkList.erase(BB); SmallVector Successors(succ_begin(BB), succ_end(BB)); DeleteDeadBlock(BB); for (SmallVectorImpl::iterator II = Successors.begin(), IE = Successors.end(); II != IE; ++II) if (pred_begin(*II) == pred_end(*II)) WorkList.insert(*II); } // Merge pairs of basic blocks with unconditional branches, connected by // a single edge. if (EverMadeChange || MadeChange) MadeChange |= eliminateFallThrough(F); EverMadeChange |= MadeChange; } if (!DisableGCOpts) { SmallVector Statepoints; for (BasicBlock &BB : F) for (Instruction &I : BB) if (isStatepoint(I)) Statepoints.push_back(&I); for (auto &I : Statepoints) EverMadeChange |= simplifyOffsetableRelocate(*I); } return EverMadeChange; } /// Merge basic blocks which are connected by a single edge, where one of the /// basic blocks has a single successor pointing to the other basic block, /// which has a single predecessor. bool CodeGenPrepare::eliminateFallThrough(Function &F) { bool Changed = false; // Scan all of the blocks in the function, except for the entry block. for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) { BasicBlock *BB = &*I++; // If the destination block has a single pred, then this is a trivial // edge, just collapse it. BasicBlock *SinglePred = BB->getSinglePredecessor(); // Don't merge if BB's address is taken. if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue; BranchInst *Term = dyn_cast(SinglePred->getTerminator()); if (Term && !Term->isConditional()) { Changed = true; DEBUG(dbgs() << "To merge:\n"<< *SinglePred << "\n\n\n"); // Remember if SinglePred was the entry block of the function. // If so, we will need to move BB back to the entry position. bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); MergeBasicBlockIntoOnlyPred(BB, nullptr); if (isEntry && BB != &BB->getParent()->getEntryBlock()) BB->moveBefore(&BB->getParent()->getEntryBlock()); // We have erased a block. Update the iterator. I = BB->getIterator(); } } return Changed; } /// Eliminate blocks that contain only PHI nodes, debug info directives, and an /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split /// edges in ways that are non-optimal for isel. Start by eliminating these /// blocks so we can split them the way we want them. bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) { bool MadeChange = false; // Note that this intentionally skips the entry block. for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) { BasicBlock *BB = &*I++; // If this block doesn't end with an uncond branch, ignore it. BranchInst *BI = dyn_cast(BB->getTerminator()); if (!BI || !BI->isUnconditional()) continue; // If the instruction before the branch (skipping debug info) isn't a phi // node, then other stuff is happening here. BasicBlock::iterator BBI = BI->getIterator(); if (BBI != BB->begin()) { --BBI; while (isa(BBI)) { if (BBI == BB->begin()) break; --BBI; } if (!isa(BBI) && !isa(BBI)) continue; } // Do not break infinite loops. BasicBlock *DestBB = BI->getSuccessor(0); if (DestBB == BB) continue; if (!canMergeBlocks(BB, DestBB)) continue; eliminateMostlyEmptyBlock(BB); MadeChange = true; } return MadeChange; } /// Return true if we can merge BB into DestBB if there is a single /// unconditional branch between them, and BB contains no other non-phi /// instructions. bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const { // We only want to eliminate blocks whose phi nodes are used by phi nodes in // the successor. If there are more complex condition (e.g. preheaders), // don't mess around with them. BasicBlock::const_iterator BBI = BB->begin(); while (const PHINode *PN = dyn_cast(BBI++)) { for (const User *U : PN->users()) { const Instruction *UI = cast(U); if (UI->getParent() != DestBB || !isa(UI)) return false; // If User is inside DestBB block and it is a PHINode then check // incoming value. If incoming value is not from BB then this is // a complex condition (e.g. preheaders) we want to avoid here. if (UI->getParent() == DestBB) { if (const PHINode *UPN = dyn_cast(UI)) for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) { Instruction *Insn = dyn_cast(UPN->getIncomingValue(I)); if (Insn && Insn->getParent() == BB && Insn->getParent() != UPN->getIncomingBlock(I)) return false; } } } } // If BB and DestBB contain any common predecessors, then the phi nodes in BB // and DestBB may have conflicting incoming values for the block. If so, we // can't merge the block. const PHINode *DestBBPN = dyn_cast(DestBB->begin()); if (!DestBBPN) return true; // no conflict. // Collect the preds of BB. SmallPtrSet BBPreds; if (const PHINode *BBPN = dyn_cast(BB->begin())) { // It is faster to get preds from a PHI than with pred_iterator. for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i) BBPreds.insert(BBPN->getIncomingBlock(i)); } else { BBPreds.insert(pred_begin(BB), pred_end(BB)); } // Walk the preds of DestBB. for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = DestBBPN->getIncomingBlock(i); if (BBPreds.count(Pred)) { // Common predecessor? BBI = DestBB->begin(); while (const PHINode *PN = dyn_cast(BBI++)) { const Value *V1 = PN->getIncomingValueForBlock(Pred); const Value *V2 = PN->getIncomingValueForBlock(BB); // If V2 is a phi node in BB, look up what the mapped value will be. if (const PHINode *V2PN = dyn_cast(V2)) if (V2PN->getParent() == BB) V2 = V2PN->getIncomingValueForBlock(Pred); // If there is a conflict, bail out. if (V1 != V2) return false; } } } return true; } /// Eliminate a basic block that has only phi's and an unconditional branch in /// it. void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) { BranchInst *BI = cast(BB->getTerminator()); BasicBlock *DestBB = BI->getSuccessor(0); DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB); // If the destination block has a single pred, then this is a trivial edge, // just collapse it. if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) { if (SinglePred != DestBB) { // Remember if SinglePred was the entry block of the function. If so, we // will need to move BB back to the entry position. bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); MergeBasicBlockIntoOnlyPred(DestBB, nullptr); if (isEntry && BB != &BB->getParent()->getEntryBlock()) BB->moveBefore(&BB->getParent()->getEntryBlock()); DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); return; } } // Otherwise, we have multiple predecessors of BB. Update the PHIs in DestBB // to handle the new incoming edges it is about to have. PHINode *PN; for (BasicBlock::iterator BBI = DestBB->begin(); (PN = dyn_cast(BBI)); ++BBI) { // Remove the incoming value for BB, and remember it. Value *InVal = PN->removeIncomingValue(BB, false); // Two options: either the InVal is a phi node defined in BB or it is some // value that dominates BB. PHINode *InValPhi = dyn_cast(InVal); if (InValPhi && InValPhi->getParent() == BB) { // Add all of the input values of the input PHI as inputs of this phi. for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i) PN->addIncoming(InValPhi->getIncomingValue(i), InValPhi->getIncomingBlock(i)); } else { // Otherwise, add one instance of the dominating value for each edge that // we will be adding. if (PHINode *BBPN = dyn_cast(BB->begin())) { for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i) PN->addIncoming(InVal, BBPN->getIncomingBlock(i)); } else { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) PN->addIncoming(InVal, *PI); } } } // The PHIs are now updated, change everything that refers to BB to use // DestBB and remove BB. BB->replaceAllUsesWith(DestBB); BB->eraseFromParent(); ++NumBlocksElim; DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); } // Computes a map of base pointer relocation instructions to corresponding // derived pointer relocation instructions given a vector of all relocate calls static void computeBaseDerivedRelocateMap( const SmallVectorImpl &AllRelocateCalls, DenseMap> &RelocateInstMap) { // Collect information in two maps: one primarily for locating the base object // while filling the second map; the second map is the final structure holding // a mapping between Base and corresponding Derived relocate calls DenseMap, GCRelocateInst *> RelocateIdxMap; for (auto *ThisRelocate : AllRelocateCalls) { auto K = std::make_pair(ThisRelocate->getBasePtrIndex(), ThisRelocate->getDerivedPtrIndex()); RelocateIdxMap.insert(std::make_pair(K, ThisRelocate)); } for (auto &Item : RelocateIdxMap) { std::pair Key = Item.first; if (Key.first == Key.second) // Base relocation: nothing to insert continue; GCRelocateInst *I = Item.second; auto BaseKey = std::make_pair(Key.first, Key.first); // We're iterating over RelocateIdxMap so we cannot modify it. auto MaybeBase = RelocateIdxMap.find(BaseKey); if (MaybeBase == RelocateIdxMap.end()) // TODO: We might want to insert a new base object relocate and gep off // that, if there are enough derived object relocates. continue; RelocateInstMap[MaybeBase->second].push_back(I); } } // Accepts a GEP and extracts the operands into a vector provided they're all // small integer constants static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP, SmallVectorImpl &OffsetV) { for (unsigned i = 1; i < GEP->getNumOperands(); i++) { // Only accept small constant integer operands auto Op = dyn_cast(GEP->getOperand(i)); if (!Op || Op->getZExtValue() > 20) return false; } for (unsigned i = 1; i < GEP->getNumOperands(); i++) OffsetV.push_back(GEP->getOperand(i)); return true; } // Takes a RelocatedBase (base pointer relocation instruction) and Targets to // replace, computes a replacement, and affects it. static bool simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase, const SmallVectorImpl &Targets) { bool MadeChange = false; for (GCRelocateInst *ToReplace : Targets) { assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() && "Not relocating a derived object of the original base object"); if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) { // A duplicate relocate call. TODO: coalesce duplicates. continue; } if (RelocatedBase->getParent() != ToReplace->getParent()) { // Base and derived relocates are in different basic blocks. // In this case transform is only valid when base dominates derived // relocate. However it would be too expensive to check dominance // for each such relocate, so we skip the whole transformation. continue; } Value *Base = ToReplace->getBasePtr(); auto Derived = dyn_cast(ToReplace->getDerivedPtr()); if (!Derived || Derived->getPointerOperand() != Base) continue; SmallVector OffsetV; if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV)) continue; // Create a Builder and replace the target callsite with a gep assert(RelocatedBase->getNextNode() && "Should always have one since it's not a terminator"); // Insert after RelocatedBase IRBuilder<> Builder(RelocatedBase->getNextNode()); Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc()); // If gc_relocate does not match the actual type, cast it to the right type. // In theory, there must be a bitcast after gc_relocate if the type does not // match, and we should reuse it to get the derived pointer. But it could be // cases like this: // bb1: // ... // %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...) // br label %merge // // bb2: // ... // %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...) // br label %merge // // merge: // %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ] // %cast = bitcast i8 addrspace(1)* %p1 in to i32 addrspace(1)* // // In this case, we can not find the bitcast any more. So we insert a new bitcast // no matter there is already one or not. In this way, we can handle all cases, and // the extra bitcast should be optimized away in later passes. Value *ActualRelocatedBase = RelocatedBase; if (RelocatedBase->getType() != Base->getType()) { ActualRelocatedBase = Builder.CreateBitCast(RelocatedBase, Base->getType()); } Value *Replacement = Builder.CreateGEP( Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV)); Replacement->takeName(ToReplace); // If the newly generated derived pointer's type does not match the original derived // pointer's type, cast the new derived pointer to match it. Same reasoning as above. Value *ActualReplacement = Replacement; if (Replacement->getType() != ToReplace->getType()) { ActualReplacement = Builder.CreateBitCast(Replacement, ToReplace->getType()); } ToReplace->replaceAllUsesWith(ActualReplacement); ToReplace->eraseFromParent(); MadeChange = true; } return MadeChange; } // Turns this: // // %base = ... // %ptr = gep %base + 15 // %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr) // %base' = relocate(%tok, i32 4, i32 4) // %ptr' = relocate(%tok, i32 4, i32 5) // %val = load %ptr' // // into this: // // %base = ... // %ptr = gep %base + 15 // %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr) // %base' = gc.relocate(%tok, i32 4, i32 4) // %ptr' = gep %base' + 15 // %val = load %ptr' bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) { bool MadeChange = false; SmallVector AllRelocateCalls; for (auto *U : I.users()) if (GCRelocateInst *Relocate = dyn_cast(U)) // Collect all the relocate calls associated with a statepoint AllRelocateCalls.push_back(Relocate); // We need atleast one base pointer relocation + one derived pointer // relocation to mangle if (AllRelocateCalls.size() < 2) return false; // RelocateInstMap is a mapping from the base relocate instruction to the // corresponding derived relocate instructions DenseMap> RelocateInstMap; computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap); if (RelocateInstMap.empty()) return false; for (auto &Item : RelocateInstMap) // Item.first is the RelocatedBase to offset against // Item.second is the vector of Targets to replace MadeChange = simplifyRelocatesOffABase(Item.first, Item.second); return MadeChange; } /// SinkCast - Sink the specified cast instruction into its user blocks static bool SinkCast(CastInst *CI) { BasicBlock *DefBB = CI->getParent(); /// InsertedCasts - Only insert a cast in each block once. DenseMap InsertedCasts; bool MadeChange = false; for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end(); UI != E; ) { Use &TheUse = UI.getUse(); Instruction *User = cast(*UI); // Figure out which BB this cast is used in. For PHI's this is the // appropriate predecessor block. BasicBlock *UserBB = User->getParent(); if (PHINode *PN = dyn_cast(User)) { UserBB = PN->getIncomingBlock(TheUse); } // Preincrement use iterator so we don't invalidate it. ++UI; // If the block selected to receive the cast is an EH pad that does not // allow non-PHI instructions before the terminator, we can't sink the // cast. if (UserBB->getTerminator()->isEHPad()) continue; // If this user is in the same block as the cast, don't change the cast. if (UserBB == DefBB) continue; // If we have already inserted a cast into this block, use it. CastInst *&InsertedCast = InsertedCasts[UserBB]; if (!InsertedCast) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); assert(InsertPt != UserBB->end()); InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "", &*InsertPt); } // Replace a use of the cast with a use of the new cast. TheUse = InsertedCast; MadeChange = true; ++NumCastUses; } // If we removed all uses, nuke the cast. if (CI->use_empty()) { CI->eraseFromParent(); MadeChange = true; } return MadeChange; } /// If the specified cast instruction is a noop copy (e.g. it's casting from /// one pointer type to another, i32->i8 on PPC), sink it into user blocks to /// reduce the number of virtual registers that must be created and coalesced. /// /// Return true if any changes are made. /// static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI, const DataLayout &DL) { // If this is a noop copy, EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType()); EVT DstVT = TLI.getValueType(DL, CI->getType()); // This is an fp<->int conversion? if (SrcVT.isInteger() != DstVT.isInteger()) return false; // If this is an extension, it will be a zero or sign extension, which // isn't a noop. if (SrcVT.bitsLT(DstVT)) return false; // If these values will be promoted, find out what they will be promoted // to. This helps us consider truncates on PPC as noop copies when they // are. if (TLI.getTypeAction(CI->getContext(), SrcVT) == TargetLowering::TypePromoteInteger) SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT); if (TLI.getTypeAction(CI->getContext(), DstVT) == TargetLowering::TypePromoteInteger) DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT); // If, after promotion, these are the same types, this is a noop copy. if (SrcVT != DstVT) return false; return SinkCast(CI); } /// Try to combine CI into a call to the llvm.uadd.with.overflow intrinsic if /// possible. /// /// Return true if any changes were made. static bool CombineUAddWithOverflow(CmpInst *CI) { Value *A, *B; Instruction *AddI; if (!match(CI, m_UAddWithOverflow(m_Value(A), m_Value(B), m_Instruction(AddI)))) return false; Type *Ty = AddI->getType(); if (!isa(Ty)) return false; // We don't want to move around uses of condition values this late, so we we // check if it is legal to create the call to the intrinsic in the basic // block containing the icmp: if (AddI->getParent() != CI->getParent() && !AddI->hasOneUse()) return false; #ifndef NDEBUG // Someday m_UAddWithOverflow may get smarter, but this is a safe assumption // for now: if (AddI->hasOneUse()) assert(*AddI->user_begin() == CI && "expected!"); #endif Module *M = CI->getModule(); Value *F = Intrinsic::getDeclaration(M, Intrinsic::uadd_with_overflow, Ty); auto *InsertPt = AddI->hasOneUse() ? CI : AddI; auto *UAddWithOverflow = CallInst::Create(F, {A, B}, "uadd.overflow", InsertPt); auto *UAdd = ExtractValueInst::Create(UAddWithOverflow, 0, "uadd", InsertPt); auto *Overflow = ExtractValueInst::Create(UAddWithOverflow, 1, "overflow", InsertPt); CI->replaceAllUsesWith(Overflow); AddI->replaceAllUsesWith(UAdd); CI->eraseFromParent(); AddI->eraseFromParent(); return true; } /// Sink the given CmpInst into user blocks to reduce the number of virtual /// registers that must be created and coalesced. This is a clear win except on /// targets with multiple condition code registers (PowerPC), where it might /// lose; some adjustment may be wanted there. /// /// Return true if any changes are made. static bool SinkCmpExpression(CmpInst *CI) { BasicBlock *DefBB = CI->getParent(); /// Only insert a cmp in each block once. DenseMap InsertedCmps; bool MadeChange = false; for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end(); UI != E; ) { Use &TheUse = UI.getUse(); Instruction *User = cast(*UI); // Preincrement use iterator so we don't invalidate it. ++UI; // Don't bother for PHI nodes. if (isa(User)) continue; // Figure out which BB this cmp is used in. BasicBlock *UserBB = User->getParent(); // If this user is in the same block as the cmp, don't change the cmp. if (UserBB == DefBB) continue; // If we have already inserted a cmp into this block, use it. CmpInst *&InsertedCmp = InsertedCmps[UserBB]; if (!InsertedCmp) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); assert(InsertPt != UserBB->end()); InsertedCmp = CmpInst::Create(CI->getOpcode(), CI->getPredicate(), CI->getOperand(0), CI->getOperand(1), "", &*InsertPt); } // Replace a use of the cmp with a use of the new cmp. TheUse = InsertedCmp; MadeChange = true; ++NumCmpUses; } // If we removed all uses, nuke the cmp. if (CI->use_empty()) { CI->eraseFromParent(); MadeChange = true; } return MadeChange; } static bool OptimizeCmpExpression(CmpInst *CI) { if (SinkCmpExpression(CI)) return true; if (CombineUAddWithOverflow(CI)) return true; return false; } /// Check if the candidates could be combined with a shift instruction, which /// includes: /// 1. Truncate instruction /// 2. And instruction and the imm is a mask of the low bits: /// imm & (imm+1) == 0 static bool isExtractBitsCandidateUse(Instruction *User) { if (!isa(User)) { if (User->getOpcode() != Instruction::And || !isa(User->getOperand(1))) return false; const APInt &Cimm = cast(User->getOperand(1))->getValue(); if ((Cimm & (Cimm + 1)).getBoolValue()) return false; } return true; } /// Sink both shift and truncate instruction to the use of truncate's BB. static bool SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, DenseMap &InsertedShifts, const TargetLowering &TLI, const DataLayout &DL) { BasicBlock *UserBB = User->getParent(); DenseMap InsertedTruncs; TruncInst *TruncI = dyn_cast(User); bool MadeChange = false; for (Value::user_iterator TruncUI = TruncI->user_begin(), TruncE = TruncI->user_end(); TruncUI != TruncE;) { Use &TruncTheUse = TruncUI.getUse(); Instruction *TruncUser = cast(*TruncUI); // Preincrement use iterator so we don't invalidate it. ++TruncUI; int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode()); if (!ISDOpcode) continue; // If the use is actually a legal node, there will not be an // implicit truncate. // FIXME: always querying the result type is just an // approximation; some nodes' legality is determined by the // operand or other means. There's no good way to find out though. if (TLI.isOperationLegalOrCustom( ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true))) continue; // Don't bother for PHI nodes. if (isa(TruncUser)) continue; BasicBlock *TruncUserBB = TruncUser->getParent(); if (UserBB == TruncUserBB) continue; BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB]; CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB]; if (!InsertedShift && !InsertedTrunc) { BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt(); assert(InsertPt != TruncUserBB->end()); // Sink the shift if (ShiftI->getOpcode() == Instruction::AShr) InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", &*InsertPt); else InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", &*InsertPt); // Sink the trunc BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt(); TruncInsertPt++; assert(TruncInsertPt != TruncUserBB->end()); InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift, TruncI->getType(), "", &*TruncInsertPt); MadeChange = true; TruncTheUse = InsertedTrunc; } } return MadeChange; } /// Sink the shift *right* instruction into user blocks if the uses could /// potentially be combined with this shift instruction and generate BitExtract /// instruction. It will only be applied if the architecture supports BitExtract /// instruction. Here is an example: /// BB1: /// %x.extract.shift = lshr i64 %arg1, 32 /// BB2: /// %x.extract.trunc = trunc i64 %x.extract.shift to i16 /// ==> /// /// BB2: /// %x.extract.shift.1 = lshr i64 %arg1, 32 /// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16 /// /// CodeGen will recoginze the pattern in BB2 and generate BitExtract /// instruction. /// Return true if any changes are made. static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, const TargetLowering &TLI, const DataLayout &DL) { BasicBlock *DefBB = ShiftI->getParent(); /// Only insert instructions in each block once. DenseMap InsertedShifts; bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType())); bool MadeChange = false; for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end(); UI != E;) { Use &TheUse = UI.getUse(); Instruction *User = cast(*UI); // Preincrement use iterator so we don't invalidate it. ++UI; // Don't bother for PHI nodes. if (isa(User)) continue; if (!isExtractBitsCandidateUse(User)) continue; BasicBlock *UserBB = User->getParent(); if (UserBB == DefBB) { // If the shift and truncate instruction are in the same BB. The use of // the truncate(TruncUse) may still introduce another truncate if not // legal. In this case, we would like to sink both shift and truncate // instruction to the BB of TruncUse. // for example: // BB1: // i64 shift.result = lshr i64 opnd, imm // trunc.result = trunc shift.result to i16 // // BB2: // ----> We will have an implicit truncate here if the architecture does // not have i16 compare. // cmp i16 trunc.result, opnd2 // if (isa(User) && shiftIsLegal // If the type of the truncate is legal, no trucate will be // introduced in other basic blocks. && (!TLI.isTypeLegal(TLI.getValueType(DL, User->getType())))) MadeChange = SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL); continue; } // If we have already inserted a shift into this block, use it. BinaryOperator *&InsertedShift = InsertedShifts[UserBB]; if (!InsertedShift) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); assert(InsertPt != UserBB->end()); if (ShiftI->getOpcode() == Instruction::AShr) InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", &*InsertPt); else InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", &*InsertPt); MadeChange = true; } // Replace a use of the shift with a use of the new shift. TheUse = InsertedShift; } // If we removed all uses, nuke the shift. if (ShiftI->use_empty()) ShiftI->eraseFromParent(); return MadeChange; } // Translate a masked load intrinsic like // <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align, // <16 x i1> %mask, <16 x i32> %passthru) // to a chain of basic blocks, with loading element one-by-one if // the appropriate mask bit is set // // %1 = bitcast i8* %addr to i32* // %2 = extractelement <16 x i1> %mask, i32 0 // %3 = icmp eq i1 %2, true // br i1 %3, label %cond.load, label %else // //cond.load: ; preds = %0 // %4 = getelementptr i32* %1, i32 0 // %5 = load i32* %4 // %6 = insertelement <16 x i32> undef, i32 %5, i32 0 // br label %else // //else: ; preds = %0, %cond.load // %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ] // %7 = extractelement <16 x i1> %mask, i32 1 // %8 = icmp eq i1 %7, true // br i1 %8, label %cond.load1, label %else2 // //cond.load1: ; preds = %else // %9 = getelementptr i32* %1, i32 1 // %10 = load i32* %9 // %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1 // br label %else2 // //else2: ; preds = %else, %cond.load1 // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] // %12 = extractelement <16 x i1> %mask, i32 2 // %13 = icmp eq i1 %12, true // br i1 %13, label %cond.load4, label %else5 // static void ScalarizeMaskedLoad(CallInst *CI) { Value *Ptr = CI->getArgOperand(0); Value *Alignment = CI->getArgOperand(1); Value *Mask = CI->getArgOperand(2); Value *Src0 = CI->getArgOperand(3); unsigned AlignVal = cast(Alignment)->getZExtValue(); VectorType *VecType = dyn_cast(CI->getType()); assert(VecType && "Unexpected return type of masked load intrinsic"); Type *EltTy = CI->getType()->getVectorElementType(); IRBuilder<> Builder(CI->getContext()); Instruction *InsertPt = CI; BasicBlock *IfBlock = CI->getParent(); BasicBlock *CondBlock = nullptr; BasicBlock *PrevIfBlock = CI->getParent(); Builder.SetInsertPoint(InsertPt); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); // Short-cut if the mask is all-true. bool IsAllOnesMask = isa(Mask) && cast(Mask)->isAllOnesValue(); if (IsAllOnesMask) { Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal); CI->replaceAllUsesWith(NewI); CI->eraseFromParent(); return; } // Adjust alignment for the scalar instruction. AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8); // Bitcast %addr fron i8* to EltTy* Type *NewPtrType = EltTy->getPointerTo(cast(Ptr->getType())->getAddressSpace()); Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); unsigned VectorWidth = VecType->getNumElements(); Value *UndefVal = UndefValue::get(VecType); // The result vector Value *VResult = UndefVal; if (isa(Mask)) { for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { if (cast(Mask)->getOperand(Idx)->isNullValue()) continue; Value *Gep = Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal); VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); } Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); CI->replaceAllUsesWith(NewI); CI->eraseFromParent(); return; } PHINode *Phi = nullptr; Value *PrevPhi = UndefVal; for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] // %mask_1 = extractelement <16 x i1> %mask, i32 Idx // %to_load = icmp eq i1 %mask_1, true // br i1 %to_load, label %cond.load, label %else // if (Idx > 0) { Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); Phi->addIncoming(VResult, CondBlock); Phi->addIncoming(PrevPhi, PrevIfBlock); PrevPhi = Phi; VResult = Phi; } Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, ConstantInt::get(Predicate->getType(), 1)); // Create "cond" block // // %EltAddr = getelementptr i32* %1, i32 0 // %Elt = load i32* %EltAddr // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx // CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load"); Builder.SetInsertPoint(InsertPt); Value *Gep = Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal); VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); // Create "else" block, fill it in the next iteration BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); OldBr->eraseFromParent(); PrevIfBlock = IfBlock; IfBlock = NewIfBlock; } Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); Phi->addIncoming(VResult, CondBlock); Phi->addIncoming(PrevPhi, PrevIfBlock); Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); CI->replaceAllUsesWith(NewI); CI->eraseFromParent(); } // Translate a masked store intrinsic, like // void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align, // <16 x i1> %mask) // to a chain of basic blocks, that stores element one-by-one if // the appropriate mask bit is set // // %1 = bitcast i8* %addr to i32* // %2 = extractelement <16 x i1> %mask, i32 0 // %3 = icmp eq i1 %2, true // br i1 %3, label %cond.store, label %else // // cond.store: ; preds = %0 // %4 = extractelement <16 x i32> %val, i32 0 // %5 = getelementptr i32* %1, i32 0 // store i32 %4, i32* %5 // br label %else // // else: ; preds = %0, %cond.store // %6 = extractelement <16 x i1> %mask, i32 1 // %7 = icmp eq i1 %6, true // br i1 %7, label %cond.store1, label %else2 // // cond.store1: ; preds = %else // %8 = extractelement <16 x i32> %val, i32 1 // %9 = getelementptr i32* %1, i32 1 // store i32 %8, i32* %9 // br label %else2 // . . . static void ScalarizeMaskedStore(CallInst *CI) { Value *Src = CI->getArgOperand(0); Value *Ptr = CI->getArgOperand(1); Value *Alignment = CI->getArgOperand(2); Value *Mask = CI->getArgOperand(3); unsigned AlignVal = cast(Alignment)->getZExtValue(); VectorType *VecType = dyn_cast(Src->getType()); assert(VecType && "Unexpected data type in masked store intrinsic"); Type *EltTy = VecType->getElementType(); IRBuilder<> Builder(CI->getContext()); Instruction *InsertPt = CI; BasicBlock *IfBlock = CI->getParent(); Builder.SetInsertPoint(InsertPt); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); // Short-cut if the mask is all-true. bool IsAllOnesMask = isa(Mask) && cast(Mask)->isAllOnesValue(); if (IsAllOnesMask) { Builder.CreateAlignedStore(Src, Ptr, AlignVal); CI->eraseFromParent(); return; } // Adjust alignment for the scalar instruction. AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8); // Bitcast %addr fron i8* to EltTy* Type *NewPtrType = EltTy->getPointerTo(cast(Ptr->getType())->getAddressSpace()); Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); unsigned VectorWidth = VecType->getNumElements(); if (isa(Mask)) { for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { if (cast(Mask)->getOperand(Idx)->isNullValue()) continue; Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); Value *Gep = Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); Builder.CreateAlignedStore(OneElt, Gep, AlignVal); } CI->eraseFromParent(); return; } for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // // %mask_1 = extractelement <16 x i1> %mask, i32 Idx // %to_store = icmp eq i1 %mask_1, true // br i1 %to_store, label %cond.store, label %else // Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, ConstantInt::get(Predicate->getType(), 1)); // Create "cond" block // // %OneElt = extractelement <16 x i32> %Src, i32 Idx // %EltAddr = getelementptr i32* %1, i32 0 // %store i32 %OneElt, i32* %EltAddr // BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store"); Builder.SetInsertPoint(InsertPt); Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); Value *Gep = Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); Builder.CreateAlignedStore(OneElt, Gep, AlignVal); // Create "else" block, fill it in the next iteration BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); OldBr->eraseFromParent(); IfBlock = NewIfBlock; } CI->eraseFromParent(); } // Translate a masked gather intrinsic like // <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4, // <16 x i1> %Mask, <16 x i32> %Src) // to a chain of basic blocks, with loading element one-by-one if // the appropriate mask bit is set // // % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind // % Mask0 = extractelement <16 x i1> %Mask, i32 0 // % ToLoad0 = icmp eq i1 % Mask0, true // br i1 % ToLoad0, label %cond.load, label %else // // cond.load: // % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 // % Load0 = load i32, i32* % Ptr0, align 4 // % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0 // br label %else // // else: // %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0] // % Mask1 = extractelement <16 x i1> %Mask, i32 1 // % ToLoad1 = icmp eq i1 % Mask1, true // br i1 % ToLoad1, label %cond.load1, label %else2 // // cond.load1: // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 // % Load1 = load i32, i32* % Ptr1, align 4 // % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1 // br label %else2 // . . . // % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src // ret <16 x i32> %Result static void ScalarizeMaskedGather(CallInst *CI) { Value *Ptrs = CI->getArgOperand(0); Value *Alignment = CI->getArgOperand(1); Value *Mask = CI->getArgOperand(2); Value *Src0 = CI->getArgOperand(3); VectorType *VecType = dyn_cast(CI->getType()); assert(VecType && "Unexpected return type of masked load intrinsic"); IRBuilder<> Builder(CI->getContext()); Instruction *InsertPt = CI; BasicBlock *IfBlock = CI->getParent(); BasicBlock *CondBlock = nullptr; BasicBlock *PrevIfBlock = CI->getParent(); Builder.SetInsertPoint(InsertPt); unsigned AlignVal = cast(Alignment)->getZExtValue(); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); Value *UndefVal = UndefValue::get(VecType); // The result vector Value *VResult = UndefVal; unsigned VectorWidth = VecType->getNumElements(); // Shorten the way if the mask is a vector of constants. bool IsConstMask = isa(Mask); if (IsConstMask) { for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { if (cast(Mask)->getOperand(Idx)->isNullValue()) continue; Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), "Ptr" + Twine(Idx)); LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx)); VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx), "Res" + Twine(Idx)); } Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); CI->replaceAllUsesWith(NewI); CI->eraseFromParent(); return; } PHINode *Phi = nullptr; Value *PrevPhi = UndefVal; for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // // %Mask1 = extractelement <16 x i1> %Mask, i32 1 // %ToLoad1 = icmp eq i1 %Mask1, true // br i1 %ToLoad1, label %cond.load, label %else // if (Idx > 0) { Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); Phi->addIncoming(VResult, CondBlock); Phi->addIncoming(PrevPhi, PrevIfBlock); PrevPhi = Phi; VResult = Phi; } Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx), "Mask" + Twine(Idx)); Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, ConstantInt::get(Predicate->getType(), 1), "ToLoad" + Twine(Idx)); // Create "cond" block // // %EltAddr = getelementptr i32* %1, i32 0 // %Elt = load i32* %EltAddr // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx // CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); Builder.SetInsertPoint(InsertPt); Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), "Ptr" + Twine(Idx)); LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx)); VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx), "Res" + Twine(Idx)); // Create "else" block, fill it in the next iteration BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); OldBr->eraseFromParent(); PrevIfBlock = IfBlock; IfBlock = NewIfBlock; } Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); Phi->addIncoming(VResult, CondBlock); Phi->addIncoming(PrevPhi, PrevIfBlock); Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); CI->replaceAllUsesWith(NewI); CI->eraseFromParent(); } // Translate a masked scatter intrinsic, like // void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4, // <16 x i1> %Mask) // to a chain of basic blocks, that stores element one-by-one if // the appropriate mask bit is set. // // % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind // % Mask0 = extractelement <16 x i1> % Mask, i32 0 // % ToStore0 = icmp eq i1 % Mask0, true // br i1 %ToStore0, label %cond.store, label %else // // cond.store: // % Elt0 = extractelement <16 x i32> %Src, i32 0 // % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 // store i32 %Elt0, i32* % Ptr0, align 4 // br label %else // // else: // % Mask1 = extractelement <16 x i1> % Mask, i32 1 // % ToStore1 = icmp eq i1 % Mask1, true // br i1 % ToStore1, label %cond.store1, label %else2 // // cond.store1: // % Elt1 = extractelement <16 x i32> %Src, i32 1 // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 // store i32 % Elt1, i32* % Ptr1, align 4 // br label %else2 // . . . static void ScalarizeMaskedScatter(CallInst *CI) { Value *Src = CI->getArgOperand(0); Value *Ptrs = CI->getArgOperand(1); Value *Alignment = CI->getArgOperand(2); Value *Mask = CI->getArgOperand(3); assert(isa(Src->getType()) && "Unexpected data type in masked scatter intrinsic"); assert(isa(Ptrs->getType()) && isa(Ptrs->getType()->getVectorElementType()) && "Vector of pointers is expected in masked scatter intrinsic"); IRBuilder<> Builder(CI->getContext()); Instruction *InsertPt = CI; BasicBlock *IfBlock = CI->getParent(); Builder.SetInsertPoint(InsertPt); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); unsigned AlignVal = cast(Alignment)->getZExtValue(); unsigned VectorWidth = Src->getType()->getVectorNumElements(); // Shorten the way if the mask is a vector of constants. bool IsConstMask = isa(Mask); if (IsConstMask) { for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { if (cast(Mask)->getOperand(Idx)->isNullValue()) continue; Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), "Elt" + Twine(Idx)); Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), "Ptr" + Twine(Idx)); Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); } CI->eraseFromParent(); return; } for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx // % ToStore = icmp eq i1 % Mask1, true // br i1 % ToStore, label %cond.store, label %else // Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx), "Mask" + Twine(Idx)); Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, ConstantInt::get(Predicate->getType(), 1), "ToStore" + Twine(Idx)); // Create "cond" block // // % Elt1 = extractelement <16 x i32> %Src, i32 1 // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 // %store i32 % Elt1, i32* % Ptr1 // BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); Builder.SetInsertPoint(InsertPt); Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), "Elt" + Twine(Idx)); Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), "Ptr" + Twine(Idx)); Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); // Create "else" block, fill it in the next iteration BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); OldBr->eraseFromParent(); IfBlock = NewIfBlock; } CI->eraseFromParent(); } /// If counting leading or trailing zeros is an expensive operation and a zero /// input is defined, add a check for zero to avoid calling the intrinsic. /// /// We want to transform: /// %z = call i64 @llvm.cttz.i64(i64 %A, i1 false) /// /// into: /// entry: /// %cmpz = icmp eq i64 %A, 0 /// br i1 %cmpz, label %cond.end, label %cond.false /// cond.false: /// %z = call i64 @llvm.cttz.i64(i64 %A, i1 true) /// br label %cond.end /// cond.end: /// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ] /// /// If the transform is performed, return true and set ModifiedDT to true. static bool despeculateCountZeros(IntrinsicInst *CountZeros, const TargetLowering *TLI, const DataLayout *DL, bool &ModifiedDT) { if (!TLI || !DL) return false; // If a zero input is undefined, it doesn't make sense to despeculate that. if (match(CountZeros->getOperand(1), m_One())) return false; // If it's cheap to speculate, there's nothing to do. auto IntrinsicID = CountZeros->getIntrinsicID(); if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) || (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz())) return false; // Only handle legal scalar cases. Anything else requires too much work. Type *Ty = CountZeros->getType(); unsigned SizeInBits = Ty->getPrimitiveSizeInBits(); if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSize()) return false; // The intrinsic will be sunk behind a compare against zero and branch. BasicBlock *StartBlock = CountZeros->getParent(); BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false"); // Create another block after the count zero intrinsic. A PHI will be added // in this block to select the result of the intrinsic or the bit-width // constant if the input to the intrinsic is zero. BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros)); BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end"); // Set up a builder to create a compare, conditional branch, and PHI. IRBuilder<> Builder(CountZeros->getContext()); Builder.SetInsertPoint(StartBlock->getTerminator()); Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc()); // Replace the unconditional branch that was created by the first split with // a compare against zero and a conditional branch. Value *Zero = Constant::getNullValue(Ty); Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz"); Builder.CreateCondBr(Cmp, EndBlock, CallBlock); StartBlock->getTerminator()->eraseFromParent(); // Create a PHI in the end block to select either the output of the intrinsic // or the bit width of the operand. Builder.SetInsertPoint(&EndBlock->front()); PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz"); CountZeros->replaceAllUsesWith(PN); Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits)); PN->addIncoming(BitWidth, StartBlock); PN->addIncoming(CountZeros, CallBlock); // We are explicitly handling the zero case, so we can set the intrinsic's // undefined zero argument to 'true'. This will also prevent reprocessing the // intrinsic; we only despeculate when a zero input is defined. CountZeros->setArgOperand(1, Builder.getTrue()); ModifiedDT = true; return true; } bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { BasicBlock *BB = CI->getParent(); // Lower inline assembly if we can. // If we found an inline asm expession, and if the target knows how to // lower it to normal LLVM code, do so now. if (TLI && isa(CI->getCalledValue())) { if (TLI->ExpandInlineAsm(CI)) { // Avoid invalidating the iterator. CurInstIterator = BB->begin(); // Avoid processing instructions out of order, which could cause // reuse before a value is defined. SunkAddrs.clear(); return true; } // Sink address computing for memory operands into the block. if (optimizeInlineAsmInst(CI)) return true; } // Align the pointer arguments to this call if the target thinks it's a good // idea unsigned MinSize, PrefAlign; if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) { for (auto &Arg : CI->arg_operands()) { // We want to align both objects whose address is used directly and // objects whose address is used in casts and GEPs, though it only makes // sense for GEPs if the offset is a multiple of the desired alignment and // if size - offset meets the size threshold. if (!Arg->getType()->isPointerTy()) continue; APInt Offset(DL->getPointerSizeInBits( cast(Arg->getType())->getAddressSpace()), 0); Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset); uint64_t Offset2 = Offset.getLimitedValue(); if ((Offset2 & (PrefAlign-1)) != 0) continue; AllocaInst *AI; if ((AI = dyn_cast(Val)) && AI->getAlignment() < PrefAlign && DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2) AI->setAlignment(PrefAlign); // Global variables can only be aligned if they are defined in this // object (i.e. they are uniquely initialized in this object), and // over-aligning global variables that have an explicit section is // forbidden. GlobalVariable *GV; if ((GV = dyn_cast(Val)) && GV->canIncreaseAlignment() && GV->getAlignment() < PrefAlign && DL->getTypeAllocSize(GV->getType()->getElementType()) >= MinSize + Offset2) GV->setAlignment(PrefAlign); } // If this is a memcpy (or similar) then we may be able to improve the // alignment if (MemIntrinsic *MI = dyn_cast(CI)) { unsigned Align = getKnownAlignment(MI->getDest(), *DL); if (MemTransferInst *MTI = dyn_cast(MI)) Align = std::min(Align, getKnownAlignment(MTI->getSource(), *DL)); if (Align > MI->getAlignment()) MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Align)); } } IntrinsicInst *II = dyn_cast(CI); if (II) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::objectsize: { // Lower all uses of llvm.objectsize.* bool Min = (cast(II->getArgOperand(1))->getZExtValue() == 1); Type *ReturnTy = CI->getType(); Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); // Substituting this can cause recursive simplifications, which can // invalidate our iterator. Use a WeakVH to hold onto it in case this // happens. WeakVH IterHandle(&*CurInstIterator); replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr); // If the iterator instruction was recursively deleted, start over at the // start of the block. if (IterHandle != CurInstIterator.getNodePtrUnchecked()) { CurInstIterator = BB->begin(); SunkAddrs.clear(); } return true; } case Intrinsic::masked_load: { // Scalarize unsupported vector masked load if (!TTI->isLegalMaskedLoad(CI->getType())) { ScalarizeMaskedLoad(CI); ModifiedDT = true; return true; } return false; } case Intrinsic::masked_store: { if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) { ScalarizeMaskedStore(CI); ModifiedDT = true; return true; } return false; } case Intrinsic::masked_gather: { if (!TTI->isLegalMaskedGather(CI->getType())) { ScalarizeMaskedGather(CI); ModifiedDT = true; return true; } return false; } case Intrinsic::masked_scatter: { if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) { ScalarizeMaskedScatter(CI); ModifiedDT = true; return true; } return false; } case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { ZExtInst *ExtVal = dyn_cast(CI->getArgOperand(0)); if (!ExtVal || !ExtVal->hasOneUse() || ExtVal->getParent() == CI->getParent()) return false; // Sink a zext feeding stlxr/stxr before it, so it can be folded into it. ExtVal->moveBefore(CI); // Mark this instruction as "inserted by CGP", so that other // optimizations don't touch it. InsertedInsts.insert(ExtVal); return true; } case Intrinsic::invariant_group_barrier: II->replaceAllUsesWith(II->getArgOperand(0)); II->eraseFromParent(); return true; case Intrinsic::cttz: case Intrinsic::ctlz: // If counting zeros is expensive, try to avoid it. return despeculateCountZeros(II, TLI, DL, ModifiedDT); } if (TLI) { // Unknown address space. // TODO: Target hook to pick which address space the intrinsic cares // about? unsigned AddrSpace = ~0u; SmallVector PtrOps; Type *AccessTy; if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace)) while (!PtrOps.empty()) if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace)) return true; } } // From here on out we're working with named functions. if (!CI->getCalledFunction()) return false; // Lower all default uses of _chk calls. This is very similar // to what InstCombineCalls does, but here we are only lowering calls // to fortified library functions (e.g. __memcpy_chk) that have the default // "don't know" as the objectsize. Anything else should be left alone. FortifiedLibCallSimplifier Simplifier(TLInfo, true); if (Value *V = Simplifier.optimizeCall(CI)) { CI->replaceAllUsesWith(V); CI->eraseFromParent(); return true; } return false; } /// Look for opportunities to duplicate return instructions to the predecessor /// to enable tail call optimizations. The case it is currently looking for is: /// @code /// bb0: /// %tmp0 = tail call i32 @f0() /// br label %return /// bb1: /// %tmp1 = tail call i32 @f1() /// br label %return /// bb2: /// %tmp2 = tail call i32 @f2() /// br label %return /// return: /// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ] /// ret i32 %retval /// @endcode /// /// => /// /// @code /// bb0: /// %tmp0 = tail call i32 @f0() /// ret i32 %tmp0 /// bb1: /// %tmp1 = tail call i32 @f1() /// ret i32 %tmp1 /// bb2: /// %tmp2 = tail call i32 @f2() /// ret i32 %tmp2 /// @endcode bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) { if (!TLI) return false; ReturnInst *RI = dyn_cast(BB->getTerminator()); if (!RI) return false; PHINode *PN = nullptr; BitCastInst *BCI = nullptr; Value *V = RI->getReturnValue(); if (V) { BCI = dyn_cast(V); if (BCI) V = BCI->getOperand(0); PN = dyn_cast(V); if (!PN) return false; } if (PN && PN->getParent() != BB) return false; // It's not safe to eliminate the sign / zero extension of the return value. // See llvm::isInTailCallPosition(). const Function *F = BB->getParent(); AttributeSet CallerAttrs = F->getAttributes(); if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) || CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) return false; // Make sure there are no instructions between the PHI and return, or that the // return is the first instruction in the block. if (PN) { BasicBlock::iterator BI = BB->begin(); do { ++BI; } while (isa(BI)); if (&*BI == BCI) // Also skip over the bitcast. ++BI; if (&*BI != RI) return false; } else { BasicBlock::iterator BI = BB->begin(); while (isa(BI)) ++BI; if (&*BI != RI) return false; } /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail /// call. SmallVector TailCalls; if (PN) { for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) { CallInst *CI = dyn_cast(PN->getIncomingValue(I)); // Make sure the phi value is indeed produced by the tail call. if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) && TLI->mayBeEmittedAsTailCall(CI)) TailCalls.push_back(CI); } } else { SmallPtrSet VisitedBBs; for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { if (!VisitedBBs.insert(*PI).second) continue; BasicBlock::InstListType &InstList = (*PI)->getInstList(); BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin(); BasicBlock::InstListType::reverse_iterator RE = InstList.rend(); do { ++RI; } while (RI != RE && isa(&*RI)); if (RI == RE) continue; CallInst *CI = dyn_cast(&*RI); if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI)) TailCalls.push_back(CI); } } bool Changed = false; for (unsigned i = 0, e = TailCalls.size(); i != e; ++i) { CallInst *CI = TailCalls[i]; CallSite CS(CI); // Conservatively require the attributes of the call to match those of the // return. Ignore noalias because it doesn't affect the call sequence. AttributeSet CalleeAttrs = CS.getAttributes(); if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). removeAttribute(Attribute::NoAlias) != AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). removeAttribute(Attribute::NoAlias)) continue; // Make sure the call instruction is followed by an unconditional branch to // the return block. BasicBlock *CallBB = CI->getParent(); BranchInst *BI = dyn_cast(CallBB->getTerminator()); if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB) continue; // Duplicate the return into CallBB. (void)FoldReturnIntoUncondBranch(RI, BB, CallBB); ModifiedDT = Changed = true; ++NumRetsDup; } // If we eliminated all predecessors of the block, delete the block now. if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB)) BB->eraseFromParent(); return Changed; } //===----------------------------------------------------------------------===// // Memory Optimization //===----------------------------------------------------------------------===// namespace { /// This is an extended version of TargetLowering::AddrMode /// which holds actual Value*'s for register values. struct ExtAddrMode : public TargetLowering::AddrMode { Value *BaseReg; Value *ScaledReg; ExtAddrMode() : BaseReg(nullptr), ScaledReg(nullptr) {} void print(raw_ostream &OS) const; void dump() const; bool operator==(const ExtAddrMode& O) const { return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) && (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) && (HasBaseReg == O.HasBaseReg) && (Scale == O.Scale); } }; #ifndef NDEBUG static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) { AM.print(OS); return OS; } #endif void ExtAddrMode::print(raw_ostream &OS) const { bool NeedPlus = false; OS << "["; if (BaseGV) { OS << (NeedPlus ? " + " : "") << "GV:"; BaseGV->printAsOperand(OS, /*PrintType=*/false); NeedPlus = true; } if (BaseOffs) { OS << (NeedPlus ? " + " : "") << BaseOffs; NeedPlus = true; } if (BaseReg) { OS << (NeedPlus ? " + " : "") << "Base:"; BaseReg->printAsOperand(OS, /*PrintType=*/false); NeedPlus = true; } if (Scale) { OS << (NeedPlus ? " + " : "") << Scale << "*"; ScaledReg->printAsOperand(OS, /*PrintType=*/false); } OS << ']'; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void ExtAddrMode::dump() const { print(dbgs()); dbgs() << '\n'; } #endif /// \brief This class provides transaction based operation on the IR. /// Every change made through this class is recorded in the internal state and /// can be undone (rollback) until commit is called. class TypePromotionTransaction { /// \brief This represents the common interface of the individual transaction. /// Each class implements the logic for doing one specific modification on /// the IR via the TypePromotionTransaction. class TypePromotionAction { protected: /// The Instruction modified. Instruction *Inst; public: /// \brief Constructor of the action. /// The constructor performs the related action on the IR. TypePromotionAction(Instruction *Inst) : Inst(Inst) {} virtual ~TypePromotionAction() {} /// \brief Undo the modification done by this action. /// When this method is called, the IR must be in the same state as it was /// before this action was applied. /// \pre Undoing the action works if and only if the IR is in the exact same /// state as it was directly after this action was applied. virtual void undo() = 0; /// \brief Advocate every change made by this action. /// When the results on the IR of the action are to be kept, it is important /// to call this function, otherwise hidden information may be kept forever. virtual void commit() { // Nothing to be done, this action is not doing anything. } }; /// \brief Utility to remember the position of an instruction. class InsertionHandler { /// Position of an instruction. /// Either an instruction: /// - Is the first in a basic block: BB is used. /// - Has a previous instructon: PrevInst is used. union { Instruction *PrevInst; BasicBlock *BB; } Point; /// Remember whether or not the instruction had a previous instruction. bool HasPrevInstruction; public: /// \brief Record the position of \p Inst. InsertionHandler(Instruction *Inst) { BasicBlock::iterator It = Inst->getIterator(); HasPrevInstruction = (It != (Inst->getParent()->begin())); if (HasPrevInstruction) Point.PrevInst = &*--It; else Point.BB = Inst->getParent(); } /// \brief Insert \p Inst at the recorded position. void insert(Instruction *Inst) { if (HasPrevInstruction) { if (Inst->getParent()) Inst->removeFromParent(); Inst->insertAfter(Point.PrevInst); } else { Instruction *Position = &*Point.BB->getFirstInsertionPt(); if (Inst->getParent()) Inst->moveBefore(Position); else Inst->insertBefore(Position); } } }; /// \brief Move an instruction before another. class InstructionMoveBefore : public TypePromotionAction { /// Original position of the instruction. InsertionHandler Position; public: /// \brief Move \p Inst before \p Before. InstructionMoveBefore(Instruction *Inst, Instruction *Before) : TypePromotionAction(Inst), Position(Inst) { DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before << "\n"); Inst->moveBefore(Before); } /// \brief Move the instruction back to its original position. void undo() override { DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n"); Position.insert(Inst); } }; /// \brief Set the operand of an instruction with a new value. class OperandSetter : public TypePromotionAction { /// Original operand of the instruction. Value *Origin; /// Index of the modified instruction. unsigned Idx; public: /// \brief Set \p Idx operand of \p Inst with \p NewVal. OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal) : TypePromotionAction(Inst), Idx(Idx) { DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n" << "for:" << *Inst << "\n" << "with:" << *NewVal << "\n"); Origin = Inst->getOperand(Idx); Inst->setOperand(Idx, NewVal); } /// \brief Restore the original value of the instruction. void undo() override { DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n" << "for: " << *Inst << "\n" << "with: " << *Origin << "\n"); Inst->setOperand(Idx, Origin); } }; /// \brief Hide the operands of an instruction. /// Do as if this instruction was not using any of its operands. class OperandsHider : public TypePromotionAction { /// The list of original operands. SmallVector OriginalValues; public: /// \brief Remove \p Inst from the uses of the operands of \p Inst. OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) { DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n"); unsigned NumOpnds = Inst->getNumOperands(); OriginalValues.reserve(NumOpnds); for (unsigned It = 0; It < NumOpnds; ++It) { // Save the current operand. Value *Val = Inst->getOperand(It); OriginalValues.push_back(Val); // Set a dummy one. // We could use OperandSetter here, but that would imply an overhead // that we are not willing to pay. Inst->setOperand(It, UndefValue::get(Val->getType())); } } /// \brief Restore the original list of uses. void undo() override { DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n"); for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It) Inst->setOperand(It, OriginalValues[It]); } }; /// \brief Build a truncate instruction. class TruncBuilder : public TypePromotionAction { Value *Val; public: /// \brief Build a truncate instruction of \p Opnd producing a \p Ty /// result. /// trunc Opnd to Ty. TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) { IRBuilder<> Builder(Opnd); Val = Builder.CreateTrunc(Opnd, Ty, "promoted"); DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n"); } /// \brief Get the built value. Value *getBuiltValue() { return Val; } /// \brief Remove the built instruction. void undo() override { DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n"); if (Instruction *IVal = dyn_cast(Val)) IVal->eraseFromParent(); } }; /// \brief Build a sign extension instruction. class SExtBuilder : public TypePromotionAction { Value *Val; public: /// \brief Build a sign extension instruction of \p Opnd producing a \p Ty /// result. /// sext Opnd to Ty. SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty) : TypePromotionAction(InsertPt) { IRBuilder<> Builder(InsertPt); Val = Builder.CreateSExt(Opnd, Ty, "promoted"); DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n"); } /// \brief Get the built value. Value *getBuiltValue() { return Val; } /// \brief Remove the built instruction. void undo() override { DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n"); if (Instruction *IVal = dyn_cast(Val)) IVal->eraseFromParent(); } }; /// \brief Build a zero extension instruction. class ZExtBuilder : public TypePromotionAction { Value *Val; public: /// \brief Build a zero extension instruction of \p Opnd producing a \p Ty /// result. /// zext Opnd to Ty. ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty) : TypePromotionAction(InsertPt) { IRBuilder<> Builder(InsertPt); Val = Builder.CreateZExt(Opnd, Ty, "promoted"); DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n"); } /// \brief Get the built value. Value *getBuiltValue() { return Val; } /// \brief Remove the built instruction. void undo() override { DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n"); if (Instruction *IVal = dyn_cast(Val)) IVal->eraseFromParent(); } }; /// \brief Mutate an instruction to another type. class TypeMutator : public TypePromotionAction { /// Record the original type. Type *OrigTy; public: /// \brief Mutate the type of \p Inst into \p NewTy. TypeMutator(Instruction *Inst, Type *NewTy) : TypePromotionAction(Inst), OrigTy(Inst->getType()) { DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy << "\n"); Inst->mutateType(NewTy); } /// \brief Mutate the instruction back to its original type. void undo() override { DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy << "\n"); Inst->mutateType(OrigTy); } }; /// \brief Replace the uses of an instruction by another instruction. class UsesReplacer : public TypePromotionAction { /// Helper structure to keep track of the replaced uses. struct InstructionAndIdx { /// The instruction using the instruction. Instruction *Inst; /// The index where this instruction is used for Inst. unsigned Idx; InstructionAndIdx(Instruction *Inst, unsigned Idx) : Inst(Inst), Idx(Idx) {} }; /// Keep track of the original uses (pair Instruction, Index). SmallVector OriginalUses; typedef SmallVectorImpl::iterator use_iterator; public: /// \brief Replace all the use of \p Inst by \p New. UsesReplacer(Instruction *Inst, Value *New) : TypePromotionAction(Inst) { DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New << "\n"); // Record the original uses. for (Use &U : Inst->uses()) { Instruction *UserI = cast(U.getUser()); OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo())); } // Now, we can replace the uses. Inst->replaceAllUsesWith(New); } /// \brief Reassign the original uses of Inst to Inst. void undo() override { DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n"); for (use_iterator UseIt = OriginalUses.begin(), EndIt = OriginalUses.end(); UseIt != EndIt; ++UseIt) { UseIt->Inst->setOperand(UseIt->Idx, Inst); } } }; /// \brief Remove an instruction from the IR. class InstructionRemover : public TypePromotionAction { /// Original position of the instruction. InsertionHandler Inserter; /// Helper structure to hide all the link to the instruction. In other /// words, this helps to do as if the instruction was removed. OperandsHider Hider; /// Keep track of the uses replaced, if any. UsesReplacer *Replacer; public: /// \brief Remove all reference of \p Inst and optinally replace all its /// uses with New. /// \pre If !Inst->use_empty(), then New != nullptr InstructionRemover(Instruction *Inst, Value *New = nullptr) : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst), Replacer(nullptr) { if (New) Replacer = new UsesReplacer(Inst, New); DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n"); Inst->removeFromParent(); } ~InstructionRemover() override { delete Replacer; } /// \brief Really remove the instruction. void commit() override { delete Inst; } /// \brief Resurrect the instruction and reassign it to the proper uses if /// new value was provided when build this action. void undo() override { DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n"); Inserter.insert(Inst); if (Replacer) Replacer->undo(); Hider.undo(); } }; public: /// Restoration point. /// The restoration point is a pointer to an action instead of an iterator /// because the iterator may be invalidated but not the pointer. typedef const TypePromotionAction *ConstRestorationPt; /// Advocate every changes made in that transaction. void commit(); /// Undo all the changes made after the given point. void rollback(ConstRestorationPt Point); /// Get the current restoration point. ConstRestorationPt getRestorationPoint() const; /// \name API for IR modification with state keeping to support rollback. /// @{ /// Same as Instruction::setOperand. void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal); /// Same as Instruction::eraseFromParent. void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr); /// Same as Value::replaceAllUsesWith. void replaceAllUsesWith(Instruction *Inst, Value *New); /// Same as Value::mutateType. void mutateType(Instruction *Inst, Type *NewTy); /// Same as IRBuilder::createTrunc. Value *createTrunc(Instruction *Opnd, Type *Ty); /// Same as IRBuilder::createSExt. Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty); /// Same as IRBuilder::createZExt. Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty); /// Same as Instruction::moveBefore. void moveBefore(Instruction *Inst, Instruction *Before); /// @} private: /// The ordered list of actions made so far. SmallVector, 16> Actions; typedef SmallVectorImpl>::iterator CommitPt; }; void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx, Value *NewVal) { Actions.push_back( make_unique(Inst, Idx, NewVal)); } void TypePromotionTransaction::eraseInstruction(Instruction *Inst, Value *NewVal) { Actions.push_back( make_unique(Inst, NewVal)); } void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst, Value *New) { Actions.push_back(make_unique(Inst, New)); } void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) { Actions.push_back(make_unique(Inst, NewTy)); } Value *TypePromotionTransaction::createTrunc(Instruction *Opnd, Type *Ty) { std::unique_ptr Ptr(new TruncBuilder(Opnd, Ty)); Value *Val = Ptr->getBuiltValue(); Actions.push_back(std::move(Ptr)); return Val; } Value *TypePromotionTransaction::createSExt(Instruction *Inst, Value *Opnd, Type *Ty) { std::unique_ptr Ptr(new SExtBuilder(Inst, Opnd, Ty)); Value *Val = Ptr->getBuiltValue(); Actions.push_back(std::move(Ptr)); return Val; } Value *TypePromotionTransaction::createZExt(Instruction *Inst, Value *Opnd, Type *Ty) { std::unique_ptr Ptr(new ZExtBuilder(Inst, Opnd, Ty)); Value *Val = Ptr->getBuiltValue(); Actions.push_back(std::move(Ptr)); return Val; } void TypePromotionTransaction::moveBefore(Instruction *Inst, Instruction *Before) { Actions.push_back( make_unique(Inst, Before)); } TypePromotionTransaction::ConstRestorationPt TypePromotionTransaction::getRestorationPoint() const { return !Actions.empty() ? Actions.back().get() : nullptr; } void TypePromotionTransaction::commit() { for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt; ++It) (*It)->commit(); Actions.clear(); } void TypePromotionTransaction::rollback( TypePromotionTransaction::ConstRestorationPt Point) { while (!Actions.empty() && Point != Actions.back().get()) { std::unique_ptr Curr = Actions.pop_back_val(); Curr->undo(); } } /// \brief A helper class for matching addressing modes. /// /// This encapsulates the logic for matching the target-legal addressing modes. class AddressingModeMatcher { SmallVectorImpl &AddrModeInsts; const TargetMachine &TM; const TargetLowering &TLI; const DataLayout &DL; /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and /// the memory instruction that we're computing this address for. Type *AccessTy; unsigned AddrSpace; Instruction *MemoryInst; /// This is the addressing mode that we're building up. This is /// part of the return value of this addressing mode matching stuff. ExtAddrMode &AddrMode; /// The instructions inserted by other CodeGenPrepare optimizations. const SetOfInstrs &InsertedInsts; /// A map from the instructions to their type before promotion. InstrToOrigTy &PromotedInsts; /// The ongoing transaction where every action should be registered. TypePromotionTransaction &TPT; /// This is set to true when we should not do profitability checks. /// When true, IsProfitableToFoldIntoAddressingMode always returns true. bool IgnoreProfitability; AddressingModeMatcher(SmallVectorImpl &AMI, const TargetMachine &TM, Type *AT, unsigned AS, Instruction *MI, ExtAddrMode &AM, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT) : AddrModeInsts(AMI), TM(TM), TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent()) ->getTargetLowering()), DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS), MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts), PromotedInsts(PromotedInsts), TPT(TPT) { IgnoreProfitability = false; } public: /// Find the maximal addressing mode that a load/store of V can fold, /// give an access type of AccessTy. This returns a list of involved /// instructions in AddrModeInsts. /// \p InsertedInsts The instructions inserted by other CodeGenPrepare /// optimizations. /// \p PromotedInsts maps the instructions to their type before promotion. /// \p The ongoing transaction where every action should be registered. static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst, SmallVectorImpl &AddrModeInsts, const TargetMachine &TM, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT) { ExtAddrMode Result; bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS, MemoryInst, Result, InsertedInsts, PromotedInsts, TPT).matchAddr(V, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); return Result; } private: bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); bool matchAddr(Value *V, unsigned Depth); bool matchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth, bool *MovedAway = nullptr); bool isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter); bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2); bool isPromotionProfitable(unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const; }; /// Try adding ScaleReg*Scale to the current addressing mode. /// Return true and update AddrMode if this addr mode is legal for the target, /// false if not. bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth) { // If Scale is 1, then this is the same as adding ScaleReg to the addressing // mode. Just process that directly. if (Scale == 1) return matchAddr(ScaleReg, Depth); // If the scale is 0, it takes nothing to add this. if (Scale == 0) return true; // If we already have a scale of this value, we can add to it, otherwise, we // need an available scale field. if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg) return false; ExtAddrMode TestAddrMode = AddrMode; // Add scale to turn X*4+X*3 -> X*7. This could also do things like // [A+B + A*7] -> [B+A*8]. TestAddrMode.Scale += Scale; TestAddrMode.ScaledReg = ScaleReg; // If the new address isn't legal, bail out. if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) return false; // It was legal, so commit it. AddrMode = TestAddrMode; // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now // to see if ScaleReg is actually X+C. If so, we can turn this into adding // X*Scale + C*Scale to addr mode. ConstantInt *CI = nullptr; Value *AddLHS = nullptr; if (isa(ScaleReg) && // not a constant expr. match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) { TestAddrMode.ScaledReg = AddLHS; TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale; // If this addressing mode is legal, commit it and remember that we folded // this instruction. if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) { AddrModeInsts.push_back(cast(ScaleReg)); AddrMode = TestAddrMode; return true; } } // Otherwise, not (x+c)*scale, just return what we have. return true; } /// This is a little filter, which returns true if an addressing computation /// involving I might be folded into a load/store accessing it. /// This doesn't need to be perfect, but needs to accept at least /// the set of instructions that MatchOperationAddr can. static bool MightBeFoldableInst(Instruction *I) { switch (I->getOpcode()) { case Instruction::BitCast: case Instruction::AddrSpaceCast: // Don't touch identity bitcasts. if (I->getType() == I->getOperand(0)->getType()) return false; return I->getType()->isPointerTy() || I->getType()->isIntegerTy(); case Instruction::PtrToInt: // PtrToInt is always a noop, as we know that the int type is pointer sized. return true; case Instruction::IntToPtr: // We know the input is intptr_t, so this is foldable. return true; case Instruction::Add: return true; case Instruction::Mul: case Instruction::Shl: // Can only handle X*C and X << C. return isa(I->getOperand(1)); case Instruction::GetElementPtr: return true; default: return false; } } /// \brief Check whether or not \p Val is a legal instruction for \p TLI. /// \note \p Val is assumed to be the product of some type promotion. /// Therefore if \p Val has an undefined state in \p TLI, this is assumed /// to be legal, as the non-promoted value would have had the same state. static bool isPromotedInstructionLegal(const TargetLowering &TLI, const DataLayout &DL, Value *Val) { Instruction *PromotedInst = dyn_cast(Val); if (!PromotedInst) return false; int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode()); // If the ISDOpcode is undefined, it was undefined before the promotion. if (!ISDOpcode) return true; // Otherwise, check if the promoted instruction is legal or not. return TLI.isOperationLegalOrCustom( ISDOpcode, TLI.getValueType(DL, PromotedInst->getType())); } /// \brief Hepler class to perform type promotion. class TypePromotionHelper { /// \brief Utility function to check whether or not a sign or zero extension /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by /// either using the operands of \p Inst or promoting \p Inst. /// The type of the extension is defined by \p IsSExt. /// In other words, check if: /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType. /// #1 Promotion applies: /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...). /// #2 Operand reuses: /// ext opnd1 to ConsideredExtType. /// \p PromotedInsts maps the instructions to their type before promotion. static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType, const InstrToOrigTy &PromotedInsts, bool IsSExt); /// \brief Utility function to determine if \p OpIdx should be promoted when /// promoting \p Inst. static bool shouldExtOperand(const Instruction *Inst, int OpIdx) { return !(isa(Inst) && OpIdx == 0); } /// \brief Utility function to promote the operand of \p Ext when this /// operand is a promotable trunc or sext or zext. /// \p PromotedInsts maps the instructions to their type before promotion. /// \p CreatedInstsCost[out] contains the cost of all instructions /// created to promote the operand of Ext. /// Newly added extensions are inserted in \p Exts. /// Newly added truncates are inserted in \p Truncs. /// Should never be called directly. /// \return The promoted value which is used instead of Ext. static Value *promoteOperandForTruncAndAnyExt( Instruction *Ext, TypePromotionTransaction &TPT, InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, SmallVectorImpl *Exts, SmallVectorImpl *Truncs, const TargetLowering &TLI); /// \brief Utility function to promote the operand of \p Ext when this /// operand is promotable and is not a supported trunc or sext. /// \p PromotedInsts maps the instructions to their type before promotion. /// \p CreatedInstsCost[out] contains the cost of all the instructions /// created to promote the operand of Ext. /// Newly added extensions are inserted in \p Exts. /// Newly added truncates are inserted in \p Truncs. /// Should never be called directly. /// \return The promoted value which is used instead of Ext. static Value *promoteOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT, InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, SmallVectorImpl *Exts, SmallVectorImpl *Truncs, const TargetLowering &TLI, bool IsSExt); /// \see promoteOperandForOther. static Value *signExtendOperandForOther( Instruction *Ext, TypePromotionTransaction &TPT, InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, SmallVectorImpl *Exts, SmallVectorImpl *Truncs, const TargetLowering &TLI) { return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost, Exts, Truncs, TLI, true); } /// \see promoteOperandForOther. static Value *zeroExtendOperandForOther( Instruction *Ext, TypePromotionTransaction &TPT, InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, SmallVectorImpl *Exts, SmallVectorImpl *Truncs, const TargetLowering &TLI) { return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost, Exts, Truncs, TLI, false); } public: /// Type for the utility function that promotes the operand of Ext. typedef Value *(*Action)(Instruction *Ext, TypePromotionTransaction &TPT, InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, SmallVectorImpl *Exts, SmallVectorImpl *Truncs, const TargetLowering &TLI); /// \brief Given a sign/zero extend instruction \p Ext, return the approriate /// action to promote the operand of \p Ext instead of using Ext. /// \return NULL if no promotable action is possible with the current /// sign extension. /// \p InsertedInsts keeps track of all the instructions inserted by the /// other CodeGenPrepare optimizations. This information is important /// because we do not want to promote these instructions as CodeGenPrepare /// will reinsert them later. Thus creating an infinite loop: create/remove. /// \p PromotedInsts maps the instructions to their type before promotion. static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts, const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts); }; bool TypePromotionHelper::canGetThrough(const Instruction *Inst, Type *ConsideredExtType, const InstrToOrigTy &PromotedInsts, bool IsSExt) { // The promotion helper does not know how to deal with vector types yet. // To be able to fix that, we would need to fix the places where we // statically extend, e.g., constants and such. if (Inst->getType()->isVectorTy()) return false; // We can always get through zext. if (isa(Inst)) return true; // sext(sext) is ok too. if (IsSExt && isa(Inst)) return true; // We can get through binary operator, if it is legal. In other words, the // binary operator must have a nuw or nsw flag. const BinaryOperator *BinOp = dyn_cast(Inst); if (BinOp && isa(BinOp) && ((!IsSExt && BinOp->hasNoUnsignedWrap()) || (IsSExt && BinOp->hasNoSignedWrap()))) return true; // Check if we can do the following simplification. // ext(trunc(opnd)) --> ext(opnd) if (!isa(Inst)) return false; Value *OpndVal = Inst->getOperand(0); // Check if we can use this operand in the extension. // If the type is larger than the result type of the extension, we cannot. if (!OpndVal->getType()->isIntegerTy() || OpndVal->getType()->getIntegerBitWidth() > ConsideredExtType->getIntegerBitWidth()) return false; // If the operand of the truncate is not an instruction, we will not have // any information on the dropped bits. // (Actually we could for constant but it is not worth the extra logic). Instruction *Opnd = dyn_cast(OpndVal); if (!Opnd) return false; // Check if the source of the type is narrow enough. // I.e., check that trunc just drops extended bits of the same kind of // the extension. // #1 get the type of the operand and check the kind of the extended bits. const Type *OpndType; InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd); if (It != PromotedInsts.end() && It->second.getInt() == IsSExt) OpndType = It->second.getPointer(); else if ((IsSExt && isa(Opnd)) || (!IsSExt && isa(Opnd))) OpndType = Opnd->getOperand(0)->getType(); else return false; // #2 check that the truncate just drops extended bits. return Inst->getType()->getIntegerBitWidth() >= OpndType->getIntegerBitWidth(); } TypePromotionHelper::Action TypePromotionHelper::getAction( Instruction *Ext, const SetOfInstrs &InsertedInsts, const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) { assert((isa(Ext) || isa(Ext)) && "Unexpected instruction type"); Instruction *ExtOpnd = dyn_cast(Ext->getOperand(0)); Type *ExtTy = Ext->getType(); bool IsSExt = isa(Ext); // If the operand of the extension is not an instruction, we cannot // get through. // If it, check we can get through. if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt)) return nullptr; // Do not promote if the operand has been added by codegenprepare. // Otherwise, it means we are undoing an optimization that is likely to be // redone, thus causing potential infinite loop. if (isa(ExtOpnd) && InsertedInsts.count(ExtOpnd)) return nullptr; // SExt or Trunc instructions. // Return the related handler. if (isa(ExtOpnd) || isa(ExtOpnd) || isa(ExtOpnd)) return promoteOperandForTruncAndAnyExt; // Regular instruction. // Abort early if we will have to insert non-free instructions. if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType())) return nullptr; return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther; } Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt( llvm::Instruction *SExt, TypePromotionTransaction &TPT, InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, SmallVectorImpl *Exts, SmallVectorImpl *Truncs, const TargetLowering &TLI) { // By construction, the operand of SExt is an instruction. Otherwise we cannot // get through it and this method should not be called. Instruction *SExtOpnd = cast(SExt->getOperand(0)); Value *ExtVal = SExt; bool HasMergedNonFreeExt = false; if (isa(SExtOpnd)) { // Replace s|zext(zext(opnd)) // => zext(opnd). HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd); Value *ZExt = TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType()); TPT.replaceAllUsesWith(SExt, ZExt); TPT.eraseInstruction(SExt); ExtVal = ZExt; } else { // Replace z|sext(trunc(opnd)) or sext(sext(opnd)) // => z|sext(opnd). TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0)); } CreatedInstsCost = 0; // Remove dead code. if (SExtOpnd->use_empty()) TPT.eraseInstruction(SExtOpnd); // Check if the extension is still needed. Instruction *ExtInst = dyn_cast(ExtVal); if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) { if (ExtInst) { if (Exts) Exts->push_back(ExtInst); CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt; } return ExtVal; } // At this point we have: ext ty opnd to ty. // Reassign the uses of ExtInst to the opnd and remove ExtInst. Value *NextVal = ExtInst->getOperand(0); TPT.eraseInstruction(ExtInst, NextVal); return NextVal; } Value *TypePromotionHelper::promoteOperandForOther( Instruction *Ext, TypePromotionTransaction &TPT, InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, SmallVectorImpl *Exts, SmallVectorImpl *Truncs, const TargetLowering &TLI, bool IsSExt) { // By construction, the operand of Ext is an instruction. Otherwise we cannot // get through it and this method should not be called. Instruction *ExtOpnd = cast(Ext->getOperand(0)); CreatedInstsCost = 0; if (!ExtOpnd->hasOneUse()) { // ExtOpnd will be promoted. // All its uses, but Ext, will need to use a truncated value of the // promoted version. // Create the truncate now. Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType()); if (Instruction *ITrunc = dyn_cast(Trunc)) { ITrunc->removeFromParent(); // Insert it just after the definition. ITrunc->insertAfter(ExtOpnd); if (Truncs) Truncs->push_back(ITrunc); } TPT.replaceAllUsesWith(ExtOpnd, Trunc); // Restore the operand of Ext (which has been replaced by the previous call // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext. TPT.setOperand(Ext, 0, ExtOpnd); } // Get through the Instruction: // 1. Update its type. // 2. Replace the uses of Ext by Inst. // 3. Extend each operand that needs to be extended. // Remember the original type of the instruction before promotion. // This is useful to know that the high bits are sign extended bits. PromotedInsts.insert(std::pair( ExtOpnd, TypeIsSExt(ExtOpnd->getType(), IsSExt))); // Step #1. TPT.mutateType(ExtOpnd, Ext->getType()); // Step #2. TPT.replaceAllUsesWith(Ext, ExtOpnd); // Step #3. Instruction *ExtForOpnd = Ext; DEBUG(dbgs() << "Propagate Ext to operands\n"); for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx; ++OpIdx) { DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n'); if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() || !shouldExtOperand(ExtOpnd, OpIdx)) { DEBUG(dbgs() << "No need to propagate\n"); continue; } // Check if we can statically extend the operand. Value *Opnd = ExtOpnd->getOperand(OpIdx); if (const ConstantInt *Cst = dyn_cast(Opnd)) { DEBUG(dbgs() << "Statically extend\n"); unsigned BitWidth = Ext->getType()->getIntegerBitWidth(); APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth) : Cst->getValue().zext(BitWidth); TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal)); continue; } // UndefValue are typed, so we have to statically sign extend them. if (isa(Opnd)) { DEBUG(dbgs() << "Statically extend\n"); TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType())); continue; } // Otherwise we have to explicity sign extend the operand. // Check if Ext was reused to extend an operand. if (!ExtForOpnd) { // If yes, create a new one. DEBUG(dbgs() << "More operands to ext\n"); Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType()) : TPT.createZExt(Ext, Opnd, Ext->getType()); if (!isa(ValForExtOpnd)) { TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd); continue; } ExtForOpnd = cast(ValForExtOpnd); } if (Exts) Exts->push_back(ExtForOpnd); TPT.setOperand(ExtForOpnd, 0, Opnd); // Move the sign extension before the insertion point. TPT.moveBefore(ExtForOpnd, ExtOpnd); TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd); CreatedInstsCost += !TLI.isExtFree(ExtForOpnd); // If more sext are required, new instructions will have to be created. ExtForOpnd = nullptr; } if (ExtForOpnd == Ext) { DEBUG(dbgs() << "Extension is useless now\n"); TPT.eraseInstruction(Ext); } return ExtOpnd; } /// Check whether or not promoting an instruction to a wider type is profitable. /// \p NewCost gives the cost of extension instructions created by the /// promotion. /// \p OldCost gives the cost of extension instructions before the promotion /// plus the number of instructions that have been /// matched in the addressing mode the promotion. /// \p PromotedOperand is the value that has been promoted. /// \return True if the promotion is profitable, false otherwise. bool AddressingModeMatcher::isPromotionProfitable( unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const { DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost << '\n'); // The cost of the new extensions is greater than the cost of the // old extension plus what we folded. // This is not profitable. if (NewCost > OldCost) return false; if (NewCost < OldCost) return true; // The promotion is neutral but it may help folding the sign extension in // loads for instance. // Check that we did not create an illegal instruction. return isPromotedInstructionLegal(TLI, DL, PromotedOperand); } /// Given an instruction or constant expr, see if we can fold the operation /// into the addressing mode. If so, update the addressing mode and return /// true, otherwise return false without modifying AddrMode. /// If \p MovedAway is not NULL, it contains the information of whether or /// not AddrInst has to be folded into the addressing mode on success. /// If \p MovedAway == true, \p AddrInst will not be part of the addressing /// because it has been moved away. /// Thus AddrInst must not be added in the matched instructions. /// This state can happen when AddrInst is a sext, since it may be moved away. /// Therefore, AddrInst may not be valid when MovedAway is true and it must /// not be referenced anymore. bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth, bool *MovedAway) { // Avoid exponential behavior on extremely deep expression trees. if (Depth >= 5) return false; // By default, all matched instructions stay in place. if (MovedAway) *MovedAway = false; switch (Opcode) { case Instruction::PtrToInt: // PtrToInt is always a noop, as we know that the int type is pointer sized. return matchAddr(AddrInst->getOperand(0), Depth); case Instruction::IntToPtr: { auto AS = AddrInst->getType()->getPointerAddressSpace(); auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS)); // This inttoptr is a no-op if the integer type is pointer sized. if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy) return matchAddr(AddrInst->getOperand(0), Depth); return false; } case Instruction::BitCast: // BitCast is always a noop, and we can handle it as long as it is // int->int or pointer->pointer (we don't want int<->fp or something). if ((AddrInst->getOperand(0)->getType()->isPointerTy() || AddrInst->getOperand(0)->getType()->isIntegerTy()) && // Don't touch identity bitcasts. These were probably put here by LSR, // and we don't want to mess around with them. Assume it knows what it // is doing. AddrInst->getOperand(0)->getType() != AddrInst->getType()) return matchAddr(AddrInst->getOperand(0), Depth); return false; case Instruction::AddrSpaceCast: { unsigned SrcAS = AddrInst->getOperand(0)->getType()->getPointerAddressSpace(); unsigned DestAS = AddrInst->getType()->getPointerAddressSpace(); if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS)) return matchAddr(AddrInst->getOperand(0), Depth); return false; } case Instruction::Add: { // Check to see if we can merge in the RHS then the LHS. If so, we win. ExtAddrMode BackupAddrMode = AddrMode; unsigned OldSize = AddrModeInsts.size(); // Start a transaction at this point. // The LHS may match but not the RHS. // Therefore, we need a higher level restoration point to undo partially // matched operation. TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); if (matchAddr(AddrInst->getOperand(1), Depth+1) && matchAddr(AddrInst->getOperand(0), Depth+1)) return true; // Restore the old addr mode info. AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); TPT.rollback(LastKnownGood); // Otherwise this was over-aggressive. Try merging in the LHS then the RHS. if (matchAddr(AddrInst->getOperand(0), Depth+1) && matchAddr(AddrInst->getOperand(1), Depth+1)) return true; // Otherwise we definitely can't merge the ADD in. AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); TPT.rollback(LastKnownGood); break; } //case Instruction::Or: // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD. //break; case Instruction::Mul: case Instruction::Shl: { // Can only handle X*C and X << C. ConstantInt *RHS = dyn_cast(AddrInst->getOperand(1)); if (!RHS) return false; int64_t Scale = RHS->getSExtValue(); if (Opcode == Instruction::Shl) Scale = 1LL << Scale; return matchScaledValue(AddrInst->getOperand(0), Scale, Depth); } case Instruction::GetElementPtr: { // Scan the GEP. We check it if it contains constant offsets and at most // one variable offset. int VariableOperand = -1; unsigned VariableScale = 0; int64_t ConstantOffset = 0; gep_type_iterator GTI = gep_type_begin(AddrInst); for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) { if (StructType *STy = dyn_cast(*GTI)) { const StructLayout *SL = DL.getStructLayout(STy); unsigned Idx = cast(AddrInst->getOperand(i))->getZExtValue(); ConstantOffset += SL->getElementOffset(Idx); } else { uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType()); if (ConstantInt *CI = dyn_cast(AddrInst->getOperand(i))) { ConstantOffset += CI->getSExtValue()*TypeSize; } else if (TypeSize) { // Scales of zero don't do anything. // We only allow one variable index at the moment. if (VariableOperand != -1) return false; // Remember the variable index. VariableOperand = i; VariableScale = TypeSize; } } } // A common case is for the GEP to only do a constant offset. In this case, // just add it to the disp field and check validity. if (VariableOperand == -1) { AddrMode.BaseOffs += ConstantOffset; if (ConstantOffset == 0 || TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) { // Check to see if we can fold the base pointer in too. if (matchAddr(AddrInst->getOperand(0), Depth+1)) return true; } AddrMode.BaseOffs -= ConstantOffset; return false; } // Save the valid addressing mode in case we can't match. ExtAddrMode BackupAddrMode = AddrMode; unsigned OldSize = AddrModeInsts.size(); // See if the scale and offset amount is valid for this target. AddrMode.BaseOffs += ConstantOffset; // Match the base operand of the GEP. if (!matchAddr(AddrInst->getOperand(0), Depth+1)) { // If it couldn't be matched, just stuff the value in a register. if (AddrMode.HasBaseReg) { AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); return false; } AddrMode.HasBaseReg = true; AddrMode.BaseReg = AddrInst->getOperand(0); } // Match the remaining variable portion of the GEP. if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale, Depth)) { // If it couldn't be matched, try stuffing the base into a register // instead of matching it, and retrying the match of the scale. AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); if (AddrMode.HasBaseReg) return false; AddrMode.HasBaseReg = true; AddrMode.BaseReg = AddrInst->getOperand(0); AddrMode.BaseOffs += ConstantOffset; if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale, Depth)) { // If even that didn't work, bail. AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); return false; } } return true; } case Instruction::SExt: case Instruction::ZExt: { Instruction *Ext = dyn_cast(AddrInst); if (!Ext) return false; // Try to move this ext out of the way of the addressing mode. // Ask for a method for doing so. TypePromotionHelper::Action TPH = TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts); if (!TPH) return false; TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); unsigned CreatedInstsCost = 0; unsigned ExtCost = !TLI.isExtFree(Ext); Value *PromotedOperand = TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI); // SExt has been moved away. // Thus either it will be rematched later in the recursive calls or it is // gone. Anyway, we must not fold it into the addressing mode at this point. // E.g., // op = add opnd, 1 // idx = ext op // addr = gep base, idx // is now: // promotedOpnd = ext opnd <- no match here // op = promoted_add promotedOpnd, 1 <- match (later in recursive calls) // addr = gep base, op <- match if (MovedAway) *MovedAway = true; assert(PromotedOperand && "TypePromotionHelper should have filtered out those cases"); ExtAddrMode BackupAddrMode = AddrMode; unsigned OldSize = AddrModeInsts.size(); if (!matchAddr(PromotedOperand, Depth) || // The total of the new cost is equal to the cost of the created // instructions. // The total of the old cost is equal to the cost of the extension plus // what we have saved in the addressing mode. !isPromotionProfitable(CreatedInstsCost, ExtCost + (AddrModeInsts.size() - OldSize), PromotedOperand)) { AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); DEBUG(dbgs() << "Sign extension does not pay off: rollback\n"); TPT.rollback(LastKnownGood); return false; } return true; } } return false; } /// If we can, try to add the value of 'Addr' into the current addressing mode. /// If Addr can't be added to AddrMode this returns false and leaves AddrMode /// unmodified. This assumes that Addr is either a pointer type or intptr_t /// for the target. /// bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) { // Start a transaction at this point that we will rollback if the matching // fails. TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); if (ConstantInt *CI = dyn_cast(Addr)) { // Fold in immediates if legal for the target. AddrMode.BaseOffs += CI->getSExtValue(); if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) return true; AddrMode.BaseOffs -= CI->getSExtValue(); } else if (GlobalValue *GV = dyn_cast(Addr)) { // If this is a global variable, try to fold it into the addressing mode. if (!AddrMode.BaseGV) { AddrMode.BaseGV = GV; if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) return true; AddrMode.BaseGV = nullptr; } } else if (Instruction *I = dyn_cast(Addr)) { ExtAddrMode BackupAddrMode = AddrMode; unsigned OldSize = AddrModeInsts.size(); // Check to see if it is possible to fold this operation. bool MovedAway = false; if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) { // This instruction may have been moved away. If so, there is nothing // to check here. if (MovedAway) return true; // Okay, it's possible to fold this. Check to see if it is actually // *profitable* to do so. We use a simple cost model to avoid increasing // register pressure too much. if (I->hasOneUse() || isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) { AddrModeInsts.push_back(I); return true; } // It isn't profitable to do this, roll back. //cerr << "NOT FOLDING: " << *I; AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); TPT.rollback(LastKnownGood); } } else if (ConstantExpr *CE = dyn_cast(Addr)) { if (matchOperationAddr(CE, CE->getOpcode(), Depth)) return true; TPT.rollback(LastKnownGood); } else if (isa(Addr)) { // Null pointer gets folded without affecting the addressing mode. return true; } // Worse case, the target should support [reg] addressing modes. :) if (!AddrMode.HasBaseReg) { AddrMode.HasBaseReg = true; AddrMode.BaseReg = Addr; // Still check for legality in case the target supports [imm] but not [i+r]. if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) return true; AddrMode.HasBaseReg = false; AddrMode.BaseReg = nullptr; } // If the base register is already taken, see if we can do [r+r]. if (AddrMode.Scale == 0) { AddrMode.Scale = 1; AddrMode.ScaledReg = Addr; if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) return true; AddrMode.Scale = 0; AddrMode.ScaledReg = nullptr; } // Couldn't match. TPT.rollback(LastKnownGood); return false; } /// Check to see if all uses of OpVal by the specified inline asm call are due /// to memory operands. If so, return true, otherwise return false. static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, const TargetMachine &TM) { const Function *F = CI->getParent()->getParent(); const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering(); const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo(); TargetLowering::AsmOperandInfoVector TargetConstraints = TLI->ParseConstraints(F->getParent()->getDataLayout(), TRI, ImmutableCallSite(CI)); for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; // Compute the constraint code and ConstraintType to use. TLI->ComputeConstraintToUse(OpInfo, SDValue()); // If this asm operand is our Value*, and if it isn't an indirect memory // operand, we can't fold it! if (OpInfo.CallOperandVal == OpVal && (OpInfo.ConstraintType != TargetLowering::C_Memory || !OpInfo.isIndirect)) return false; } return true; } /// Recursively walk all the uses of I until we find a memory use. /// If we find an obviously non-foldable instruction, return true. /// Add the ultimately found memory instructions to MemoryUses. static bool FindAllMemoryUses( Instruction *I, SmallVectorImpl> &MemoryUses, SmallPtrSetImpl &ConsideredInsts, const TargetMachine &TM) { // If we already considered this instruction, we're done. if (!ConsideredInsts.insert(I).second) return false; // If this is an obviously unfoldable instruction, bail out. if (!MightBeFoldableInst(I)) return true; // Loop over all the uses, recursively processing them. for (Use &U : I->uses()) { Instruction *UserI = cast(U.getUser()); if (LoadInst *LI = dyn_cast(UserI)) { MemoryUses.push_back(std::make_pair(LI, U.getOperandNo())); continue; } if (StoreInst *SI = dyn_cast(UserI)) { unsigned opNo = U.getOperandNo(); if (opNo == 0) return true; // Storing addr, not into addr. MemoryUses.push_back(std::make_pair(SI, opNo)); continue; } if (CallInst *CI = dyn_cast(UserI)) { InlineAsm *IA = dyn_cast(CI->getCalledValue()); if (!IA) return true; // If this is a memory operand, we're cool, otherwise bail out. if (!IsOperandAMemoryOperand(CI, IA, I, TM)) return true; continue; } if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM)) return true; } return false; } /// Return true if Val is already known to be live at the use site that we're /// folding it into. If so, there is no cost to include it in the addressing /// mode. KnownLive1 and KnownLive2 are two values that we know are live at the /// instruction already. bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, Value *KnownLive2) { // If Val is either of the known-live values, we know it is live! if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2) return true; // All values other than instructions and arguments (e.g. constants) are live. if (!isa(Val) && !isa(Val)) return true; // If Val is a constant sized alloca in the entry block, it is live, this is // true because it is just a reference to the stack/frame pointer, which is // live for the whole function. if (AllocaInst *AI = dyn_cast(Val)) if (AI->isStaticAlloca()) return true; // Check to see if this value is already used in the memory instruction's // block. If so, it's already live into the block at the very least, so we // can reasonably fold it. return Val->isUsedInBasicBlock(MemoryInst->getParent()); } /// It is possible for the addressing mode of the machine to fold the specified /// instruction into a load or store that ultimately uses it. /// However, the specified instruction has multiple uses. /// Given this, it may actually increase register pressure to fold it /// into the load. For example, consider this code: /// /// X = ... /// Y = X+1 /// use(Y) -> nonload/store /// Z = Y+1 /// load Z /// /// In this case, Y has multiple uses, and can be folded into the load of Z /// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to /// be live at the use(Y) line. If we don't fold Y into load Z, we use one /// fewer register. Since Y can't be folded into "use(Y)" we don't increase the /// number of computations either. /// /// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If /// X was live across 'load Z' for other reasons, we actually *would* want to /// fold the addressing mode in the Z case. This would make Y die earlier. bool AddressingModeMatcher:: isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter) { if (IgnoreProfitability) return true; // AMBefore is the addressing mode before this instruction was folded into it, // and AMAfter is the addressing mode after the instruction was folded. Get // the set of registers referenced by AMAfter and subtract out those // referenced by AMBefore: this is the set of values which folding in this // address extends the lifetime of. // // Note that there are only two potential values being referenced here, // BaseReg and ScaleReg (global addresses are always available, as are any // folded immediates). Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg; // If the BaseReg or ScaledReg was referenced by the previous addrmode, their // lifetime wasn't extended by adding this instruction. if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) BaseReg = nullptr; if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg)) ScaledReg = nullptr; // If folding this instruction (and it's subexprs) didn't extend any live // ranges, we're ok with it. if (!BaseReg && !ScaledReg) return true; // If all uses of this instruction are ultimately load/store/inlineasm's, // check to see if their addressing modes will include this instruction. If // so, we can fold it into all uses, so it doesn't matter if it has multiple // uses. SmallVector, 16> MemoryUses; SmallPtrSet ConsideredInsts; if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM)) return false; // Has a non-memory, non-foldable use! // Now that we know that all uses of this instruction are part of a chain of // computation involving only operations that could theoretically be folded // into a memory use, loop over each of these uses and see if they could // *actually* fold the instruction. SmallVector MatchedAddrModeInsts; for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) { Instruction *User = MemoryUses[i].first; unsigned OpNo = MemoryUses[i].second; // Get the access type of this use. If the use isn't a pointer, we don't // know what it accesses. Value *Address = User->getOperand(OpNo); PointerType *AddrTy = dyn_cast(Address->getType()); if (!AddrTy) return false; Type *AddressAccessTy = AddrTy->getElementType(); unsigned AS = AddrTy->getAddressSpace(); // Do a match against the root of this address, ignoring profitability. This // will tell us if the addressing mode for the memory operation will // *actually* cover the shared instruction. ExtAddrMode Result; TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy, AS, MemoryInst, Result, InsertedInsts, PromotedInsts, TPT); Matcher.IgnoreProfitability = true; bool Success = Matcher.matchAddr(Address, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); // The match was to check the profitability, the changes made are not // part of the original matcher. Therefore, they should be dropped // otherwise the original matcher will not present the right state. TPT.rollback(LastKnownGood); // If the match didn't cover I, then it won't be shared by it. if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(), I) == MatchedAddrModeInsts.end()) return false; MatchedAddrModeInsts.clear(); } return true; } } // end anonymous namespace /// Return true if the specified values are defined in a /// different basic block than BB. static bool IsNonLocalValue(Value *V, BasicBlock *BB) { if (Instruction *I = dyn_cast(V)) return I->getParent() != BB; return false; } /// Load and Store Instructions often have addressing modes that can do /// significant amounts of computation. As such, instruction selection will try /// to get the load or store to do as much computation as possible for the /// program. The problem is that isel can only see within a single block. As /// such, we sink as much legal addressing mode work into the block as possible. /// /// This method is used to optimize both load/store and inline asms with memory /// operands. bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy, unsigned AddrSpace) { Value *Repl = Addr; // Try to collapse single-value PHI nodes. This is necessary to undo // unprofitable PRE transformations. SmallVector worklist; SmallPtrSet Visited; worklist.push_back(Addr); // Use a worklist to iteratively look through PHI nodes, and ensure that // the addressing mode obtained from the non-PHI roots of the graph // are equivalent. Value *Consensus = nullptr; unsigned NumUsesConsensus = 0; bool IsNumUsesConsensusValid = false; SmallVector AddrModeInsts; ExtAddrMode AddrMode; TypePromotionTransaction TPT; TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); while (!worklist.empty()) { Value *V = worklist.back(); worklist.pop_back(); // Break use-def graph loops. if (!Visited.insert(V).second) { Consensus = nullptr; break; } // For a PHI node, push all of its incoming values. if (PHINode *P = dyn_cast(V)) { for (Value *IncValue : P->incoming_values()) worklist.push_back(IncValue); continue; } // For non-PHIs, determine the addressing mode being computed. SmallVector NewAddrModeInsts; ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM, InsertedInsts, PromotedInsts, TPT); // This check is broken into two cases with very similar code to avoid using // getNumUses() as much as possible. Some values have a lot of uses, so // calling getNumUses() unconditionally caused a significant compile-time // regression. if (!Consensus) { Consensus = V; AddrMode = NewAddrMode; AddrModeInsts = NewAddrModeInsts; continue; } else if (NewAddrMode == AddrMode) { if (!IsNumUsesConsensusValid) { NumUsesConsensus = Consensus->getNumUses(); IsNumUsesConsensusValid = true; } // Ensure that the obtained addressing mode is equivalent to that obtained // for all other roots of the PHI traversal. Also, when choosing one // such root as representative, select the one with the most uses in order // to keep the cost modeling heuristics in AddressingModeMatcher // applicable. unsigned NumUses = V->getNumUses(); if (NumUses > NumUsesConsensus) { Consensus = V; NumUsesConsensus = NumUses; AddrModeInsts = NewAddrModeInsts; } continue; } Consensus = nullptr; break; } // If the addressing mode couldn't be determined, or if multiple different // ones were determined, bail out now. if (!Consensus) { TPT.rollback(LastKnownGood); return false; } TPT.commit(); // Check to see if any of the instructions supersumed by this addr mode are // non-local to I's BB. bool AnyNonLocal = false; for (unsigned i = 0, e = AddrModeInsts.size(); i != e; ++i) { if (IsNonLocalValue(AddrModeInsts[i], MemoryInst->getParent())) { AnyNonLocal = true; break; } } // If all the instructions matched are already in this BB, don't do anything. if (!AnyNonLocal) { DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode << "\n"); return false; } // Insert this computation right after this user. Since our caller is // scanning from the top of the BB to the bottom, reuse of the expr are // guaranteed to happen later. IRBuilder<> Builder(MemoryInst); // Now that we determined the addressing expression we want to use and know // that we have to sink it into this block. Check to see if we have already // done this for some other load/store instr in this block. If so, reuse the // computation. Value *&SunkAddr = SunkAddrs[Addr]; if (SunkAddr) { DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for " << *MemoryInst << "\n"); if (SunkAddr->getType() != Addr->getType()) SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() && TM && TM->getSubtargetImpl(*MemoryInst->getParent()->getParent()) ->useAA())) { // By default, we use the GEP-based method when AA is used later. This // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities. DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst << "\n"); Type *IntPtrTy = DL->getIntPtrType(Addr->getType()); Value *ResultPtr = nullptr, *ResultIndex = nullptr; // First, find the pointer. if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) { ResultPtr = AddrMode.BaseReg; AddrMode.BaseReg = nullptr; } if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) { // We can't add more than one pointer together, nor can we scale a // pointer (both of which seem meaningless). if (ResultPtr || AddrMode.Scale != 1) return false; ResultPtr = AddrMode.ScaledReg; AddrMode.Scale = 0; } if (AddrMode.BaseGV) { if (ResultPtr) return false; ResultPtr = AddrMode.BaseGV; } // If the real base value actually came from an inttoptr, then the matcher // will look through it and provide only the integer value. In that case, // use it here. if (!ResultPtr && AddrMode.BaseReg) { ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(), "sunkaddr"); AddrMode.BaseReg = nullptr; } else if (!ResultPtr && AddrMode.Scale == 1) { ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(), "sunkaddr"); AddrMode.Scale = 0; } if (!ResultPtr && !AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) { SunkAddr = Constant::getNullValue(Addr->getType()); } else if (!ResultPtr) { return false; } else { Type *I8PtrTy = Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace()); Type *I8Ty = Builder.getInt8Ty(); // Start with the base register. Do this first so that subsequent address // matching finds it last, which will prevent it from trying to match it // as the scaled value in case it happens to be a mul. That would be // problematic if we've sunk a different mul for the scale, because then // we'd end up sinking both muls. if (AddrMode.BaseReg) { Value *V = AddrMode.BaseReg; if (V->getType() != IntPtrTy) V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); ResultIndex = V; } // Add the scale value. if (AddrMode.Scale) { Value *V = AddrMode.ScaledReg; if (V->getType() == IntPtrTy) { // done. } else if (cast(IntPtrTy)->getBitWidth() < cast(V->getType())->getBitWidth()) { V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); } else { // It is only safe to sign extend the BaseReg if we know that the math // required to create it did not overflow before we extend it. Since // the original IR value was tossed in favor of a constant back when // the AddrMode was created we need to bail out gracefully if widths // do not match instead of extending it. Instruction *I = dyn_cast_or_null(ResultIndex); if (I && (ResultIndex != AddrMode.BaseReg)) I->eraseFromParent(); return false; } if (AddrMode.Scale != 1) V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), "sunkaddr"); if (ResultIndex) ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr"); else ResultIndex = V; } // Add in the Base Offset if present. if (AddrMode.BaseOffs) { Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); if (ResultIndex) { // We need to add this separately from the scale above to help with // SDAG consecutive load/store merging. if (ResultPtr->getType() != I8PtrTy) ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); } ResultIndex = V; } if (!ResultIndex) { SunkAddr = ResultPtr; } else { if (ResultPtr->getType() != I8PtrTy) ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); } if (SunkAddr->getType() != Addr->getType()) SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); } } else { DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst << "\n"); Type *IntPtrTy = DL->getIntPtrType(Addr->getType()); Value *Result = nullptr; // Start with the base register. Do this first so that subsequent address // matching finds it last, which will prevent it from trying to match it // as the scaled value in case it happens to be a mul. That would be // problematic if we've sunk a different mul for the scale, because then // we'd end up sinking both muls. if (AddrMode.BaseReg) { Value *V = AddrMode.BaseReg; if (V->getType()->isPointerTy()) V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); if (V->getType() != IntPtrTy) V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); Result = V; } // Add the scale value. if (AddrMode.Scale) { Value *V = AddrMode.ScaledReg; if (V->getType() == IntPtrTy) { // done. } else if (V->getType()->isPointerTy()) { V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); } else if (cast(IntPtrTy)->getBitWidth() < cast(V->getType())->getBitWidth()) { V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); } else { // It is only safe to sign extend the BaseReg if we know that the math // required to create it did not overflow before we extend it. Since // the original IR value was tossed in favor of a constant back when // the AddrMode was created we need to bail out gracefully if widths // do not match instead of extending it. Instruction *I = dyn_cast_or_null(Result); if (I && (Result != AddrMode.BaseReg)) I->eraseFromParent(); return false; } if (AddrMode.Scale != 1) V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), "sunkaddr"); if (Result) Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } // Add in the BaseGV if present. if (AddrMode.BaseGV) { Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr"); if (Result) Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } // Add in the Base Offset if present. if (AddrMode.BaseOffs) { Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); if (Result) Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } if (!Result) SunkAddr = Constant::getNullValue(Addr->getType()); else SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr"); } MemoryInst->replaceUsesOfWith(Repl, SunkAddr); // If we have no uses, recursively delete the value and all dead instructions // using it. if (Repl->use_empty()) { // This can cause recursive deletion, which can invalidate our iterator. // Use a WeakVH to hold onto it in case this happens. WeakVH IterHandle(&*CurInstIterator); BasicBlock *BB = CurInstIterator->getParent(); RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo); if (IterHandle != CurInstIterator.getNodePtrUnchecked()) { // If the iterator instruction was recursively deleted, start over at the // start of the block. CurInstIterator = BB->begin(); SunkAddrs.clear(); } } ++NumMemoryInsts; return true; } /// If there are any memory operands, use OptimizeMemoryInst to sink their /// address computing into the block when possible / profitable. bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { bool MadeChange = false; const TargetRegisterInfo *TRI = TM->getSubtargetImpl(*CS->getParent()->getParent())->getRegisterInfo(); TargetLowering::AsmOperandInfoVector TargetConstraints = TLI->ParseConstraints(*DL, TRI, CS); unsigned ArgNo = 0; for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; // Compute the constraint code and ConstraintType to use. TLI->ComputeConstraintToUse(OpInfo, SDValue()); if (OpInfo.ConstraintType == TargetLowering::C_Memory && OpInfo.isIndirect) { Value *OpVal = CS->getArgOperand(ArgNo++); MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u); } else if (OpInfo.Type == InlineAsm::isInput) ArgNo++; } return MadeChange; } /// \brief Check if all the uses of \p Inst are equivalent (or free) zero or /// sign extensions. static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { assert(!Inst->use_empty() && "Input must have at least one use"); const Instruction *FirstUser = cast(*Inst->user_begin()); bool IsSExt = isa(FirstUser); Type *ExtTy = FirstUser->getType(); for (const User *U : Inst->users()) { const Instruction *UI = cast(U); if ((IsSExt && !isa(UI)) || (!IsSExt && !isa(UI))) return false; Type *CurTy = UI->getType(); // Same input and output types: Same instruction after CSE. if (CurTy == ExtTy) continue; // If IsSExt is true, we are in this situation: // a = Inst // b = sext ty1 a to ty2 // c = sext ty1 a to ty3 // Assuming ty2 is shorter than ty3, this could be turned into: // a = Inst // b = sext ty1 a to ty2 // c = sext ty2 b to ty3 // However, the last sext is not free. if (IsSExt) return false; // This is a ZExt, maybe this is free to extend from one type to another. // In that case, we would not account for a different use. Type *NarrowTy; Type *LargeTy; if (ExtTy->getScalarType()->getIntegerBitWidth() > CurTy->getScalarType()->getIntegerBitWidth()) { NarrowTy = CurTy; LargeTy = ExtTy; } else { NarrowTy = ExtTy; LargeTy = CurTy; } if (!TLI.isZExtFree(NarrowTy, LargeTy)) return false; } // All uses are the same or can be derived from one another for free. return true; } /// \brief Try to form ExtLd by promoting \p Exts until they reach a /// load instruction. /// If an ext(load) can be formed, it is returned via \p LI for the load /// and \p Inst for the extension. /// Otherwise LI == nullptr and Inst == nullptr. /// When some promotion happened, \p TPT contains the proper state to /// revert them. /// /// \return true when promoting was necessary to expose the ext(load) /// opportunity, false otherwise. /// /// Example: /// \code /// %ld = load i32* %addr /// %add = add nuw i32 %ld, 4 /// %zext = zext i32 %add to i64 /// \endcode /// => /// \code /// %ld = load i32* %addr /// %zext = zext i32 %ld to i64 /// %add = add nuw i64 %zext, 4 /// \encode /// Thanks to the promotion, we can match zext(load i32*) to i64. bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI, Instruction *&Inst, const SmallVectorImpl &Exts, unsigned CreatedInstsCost = 0) { // Iterate over all the extensions to see if one form an ext(load). for (auto I : Exts) { // Check if we directly have ext(load). if ((LI = dyn_cast(I->getOperand(0)))) { Inst = I; // No promotion happened here. return false; } // Check whether or not we want to do any promotion. if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion) continue; // Get the action to perform the promotion. TypePromotionHelper::Action TPH = TypePromotionHelper::getAction( I, InsertedInsts, *TLI, PromotedInsts); // Check if we can promote. if (!TPH) continue; // Save the current state. TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); SmallVector NewExts; unsigned NewCreatedInstsCost = 0; unsigned ExtCost = !TLI->isExtFree(I); // Promote. Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost, &NewExts, nullptr, *TLI); assert(PromotedVal && "TypePromotionHelper should have filtered out those cases"); // We would be able to merge only one extension in a load. // Therefore, if we have more than 1 new extension we heuristically // cut this search path, because it means we degrade the code quality. // With exactly 2, the transformation is neutral, because we will merge // one extension but leave one. However, we optimistically keep going, // because the new extension may be removed too. long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost; TotalCreatedInstsCost -= ExtCost; if (!StressExtLdPromotion && (TotalCreatedInstsCost > 1 || !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) { // The promotion is not profitable, rollback to the previous state. TPT.rollback(LastKnownGood); continue; } // The promotion is profitable. // Check if it exposes an ext(load). (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost); if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost || // If we have created a new extension, i.e., now we have two // extensions. We must make sure one of them is merged with // the load, otherwise we may degrade the code quality. (LI->hasOneUse() || hasSameExtUse(LI, *TLI)))) // Promotion happened. return true; // If this does not help to expose an ext(load) then, rollback. TPT.rollback(LastKnownGood); } // None of the extension can form an ext(load). LI = nullptr; Inst = nullptr; return false; } /// Move a zext or sext fed by a load into the same basic block as the load, /// unless conditions are unfavorable. This allows SelectionDAG to fold the /// extend into the load. /// \p I[in/out] the extension may be modified during the process if some /// promotions apply. /// bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) { // Try to promote a chain of computation if it allows to form // an extended load. TypePromotionTransaction TPT; TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); SmallVector Exts; Exts.push_back(I); // Look for a load being extended. LoadInst *LI = nullptr; Instruction *OldExt = I; bool HasPromoted = extLdPromotion(TPT, LI, I, Exts); if (!LI || !I) { assert(!HasPromoted && !LI && "If we did not match any load instruction " "the code must remain the same"); I = OldExt; return false; } // If they're already in the same block, there's nothing to do. // Make the cheap checks first if we did not promote. // If we promoted, we need to check if it is indeed profitable. if (!HasPromoted && LI->getParent() == I->getParent()) return false; EVT VT = TLI->getValueType(*DL, I->getType()); EVT LoadVT = TLI->getValueType(*DL, LI->getType()); // If the load has other users and the truncate is not free, this probably // isn't worthwhile. if (!LI->hasOneUse() && TLI && (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && !TLI->isTruncateFree(I->getType(), LI->getType())) { I = OldExt; TPT.rollback(LastKnownGood); return false; } // Check whether the target supports casts folded into loads. unsigned LType; if (isa(I)) LType = ISD::ZEXTLOAD; else { assert(isa(I) && "Unexpected ext type!"); LType = ISD::SEXTLOAD; } if (TLI && !TLI->isLoadExtLegal(LType, VT, LoadVT)) { I = OldExt; TPT.rollback(LastKnownGood); return false; } // Move the extend into the same block as the load, so that SelectionDAG // can fold it. TPT.commit(); I->removeFromParent(); I->insertAfter(LI); ++NumExtsMoved; return true; } bool CodeGenPrepare::optimizeExtUses(Instruction *I) { BasicBlock *DefBB = I->getParent(); // If the result of a {s|z}ext and its source are both live out, rewrite all // other uses of the source with result of extension. Value *Src = I->getOperand(0); if (Src->hasOneUse()) return false; // Only do this xform if truncating is free. if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType())) return false; // Only safe to perform the optimization if the source is also defined in // this block. if (!isa(Src) || DefBB != cast(Src)->getParent()) return false; bool DefIsLiveOut = false; for (User *U : I->users()) { Instruction *UI = cast(U); // Figure out which BB this ext is used in. BasicBlock *UserBB = UI->getParent(); if (UserBB == DefBB) continue; DefIsLiveOut = true; break; } if (!DefIsLiveOut) return false; // Make sure none of the uses are PHI nodes. for (User *U : Src->users()) { Instruction *UI = cast(U); BasicBlock *UserBB = UI->getParent(); if (UserBB == DefBB) continue; // Be conservative. We don't want this xform to end up introducing // reloads just before load / store instructions. if (isa(UI) || isa(UI) || isa(UI)) return false; } // InsertedTruncs - Only insert one trunc in each block once. DenseMap InsertedTruncs; bool MadeChange = false; for (Use &U : Src->uses()) { Instruction *User = cast(U.getUser()); // Figure out which BB this ext is used in. BasicBlock *UserBB = User->getParent(); if (UserBB == DefBB) continue; // Both src and def are live in this block. Rewrite the use. Instruction *&InsertedTrunc = InsertedTruncs[UserBB]; if (!InsertedTrunc) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); assert(InsertPt != UserBB->end()); InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt); InsertedInsts.insert(InsertedTrunc); } // Replace a use of the {s|z}ext source with a use of the result. U = InsertedTrunc; ++NumExtUses; MadeChange = true; } return MadeChange; } // Find loads whose uses only use some of the loaded value's bits. Add an "and" // just after the load if the target can fold this into one extload instruction, // with the hope of eliminating some of the other later "and" instructions using // the loaded value. "and"s that are made trivially redundant by the insertion // of the new "and" are removed by this function, while others (e.g. those whose // path from the load goes through a phi) are left for isel to potentially // remove. // // For example: // // b0: // x = load i32 // ... // b1: // y = and x, 0xff // z = use y // // becomes: // // b0: // x = load i32 // x' = and x, 0xff // ... // b1: // z = use x' // // whereas: // // b0: // x1 = load i32 // ... // b1: // x2 = load i32 // ... // b2: // x = phi x1, x2 // y = and x, 0xff // // becomes (after a call to optimizeLoadExt for each load): // // b0: // x1 = load i32 // x1' = and x1, 0xff // ... // b1: // x2 = load i32 // x2' = and x2, 0xff // ... // b2: // x = phi x1', x2' // y = and x, 0xff // bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { if (!Load->isSimple() || !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy())) return false; // Skip loads we've already transformed or have no reason to transform. if (Load->hasOneUse()) { User *LoadUser = *Load->user_begin(); if (cast(LoadUser)->getParent() == Load->getParent() && !dyn_cast(LoadUser)) return false; } // Look at all uses of Load, looking through phis, to determine how many bits // of the loaded value are needed. SmallVector WorkList; SmallPtrSet Visited; SmallVector AndsToMaybeRemove; for (auto *U : Load->users()) WorkList.push_back(cast(U)); EVT LoadResultVT = TLI->getValueType(*DL, Load->getType()); unsigned BitWidth = LoadResultVT.getSizeInBits(); APInt DemandBits(BitWidth, 0); APInt WidestAndBits(BitWidth, 0); while (!WorkList.empty()) { Instruction *I = WorkList.back(); WorkList.pop_back(); // Break use-def graph loops. if (!Visited.insert(I).second) continue; // For a PHI node, push all of its users. if (auto *Phi = dyn_cast(I)) { for (auto *U : Phi->users()) WorkList.push_back(cast(U)); continue; } switch (I->getOpcode()) { case llvm::Instruction::And: { auto *AndC = dyn_cast(I->getOperand(1)); if (!AndC) return false; APInt AndBits = AndC->getValue(); DemandBits |= AndBits; // Keep track of the widest and mask we see. if (AndBits.ugt(WidestAndBits)) WidestAndBits = AndBits; if (AndBits == WidestAndBits && I->getOperand(0) == Load) AndsToMaybeRemove.push_back(I); break; } case llvm::Instruction::Shl: { auto *ShlC = dyn_cast(I->getOperand(1)); if (!ShlC) return false; uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1); auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt); DemandBits |= ShlDemandBits; break; } case llvm::Instruction::Trunc: { EVT TruncVT = TLI->getValueType(*DL, I->getType()); unsigned TruncBitWidth = TruncVT.getSizeInBits(); auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth); DemandBits |= TruncBits; break; } default: return false; } } uint32_t ActiveBits = DemandBits.getActiveBits(); // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example, // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but // (and (load x) 1) is not matched as a single instruction, rather as a LDR // followed by an AND. // TODO: Look into removing this restriction by fixing backends to either // return false for isLoadExtLegal for i1 or have them select this pattern to // a single instruction. // // Also avoid hoisting if we didn't see any ands with the exact DemandBits // mask, since these are the only ands that will be removed by isel. if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) || WidestAndBits != DemandBits) return false; LLVMContext &Ctx = Load->getType()->getContext(); Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits); EVT TruncVT = TLI->getValueType(*DL, TruncTy); // Reject cases that won't be matched as extloads. if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() || !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT)) return false; IRBuilder<> Builder(Load->getNextNode()); auto *NewAnd = dyn_cast( Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits))); // Replace all uses of load with new and (except for the use of load in the // new and itself). Load->replaceAllUsesWith(NewAnd); NewAnd->setOperand(0, Load); // Remove any and instructions that are now redundant. for (auto *And : AndsToMaybeRemove) // Check that the and mask is the same as the one we decided to put on the // new and. if (cast(And->getOperand(1))->getValue() == DemandBits) { And->replaceAllUsesWith(NewAnd); if (&*CurInstIterator == And) CurInstIterator = std::next(And->getIterator()); And->eraseFromParent(); ++NumAndUses; } ++NumAndsAdded; return true; } /// Check if V (an operand of a select instruction) is an expensive instruction /// that is only used once. static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) { auto *I = dyn_cast(V); // If it's safe to speculatively execute, then it should not have side // effects; therefore, it's safe to sink and possibly *not* execute. return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) && TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive; } /// Returns true if a SelectInst should be turned into an explicit branch. static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI, SelectInst *SI) { // FIXME: This should use the same heuristics as IfConversion to determine // whether a select is better represented as a branch. This requires that // branch probability metadata is preserved for the select, which is not the // case currently. CmpInst *Cmp = dyn_cast(SI->getCondition()); // If a branch is predictable, an out-of-order CPU can avoid blocking on its // comparison condition. If the compare has more than one use, there's // probably another cmov or setcc around, so it's not worth emitting a branch. if (!Cmp || !Cmp->hasOneUse()) return false; Value *CmpOp0 = Cmp->getOperand(0); Value *CmpOp1 = Cmp->getOperand(1); // Emit "cmov on compare with a memory operand" as a branch to avoid stalls // on a load from memory. But if the load is used more than once, do not // change the select to a branch because the load is probably needed // regardless of whether the branch is taken or not. if ((isa(CmpOp0) && CmpOp0->hasOneUse()) || (isa(CmpOp1) && CmpOp1->hasOneUse())) return true; // If either operand of the select is expensive and only needed on one side // of the select, we should form a branch. if (sinkSelectOperand(TTI, SI->getTrueValue()) || sinkSelectOperand(TTI, SI->getFalseValue())) return true; return false; } /// If we have a SelectInst that will likely profit from branch prediction, /// turn it into a branch. bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); // Can we convert the 'select' to CF ? if (DisableSelectToBranch || OptSize || !TLI || VectorCond) return false; TargetLowering::SelectSupportKind SelectKind; if (VectorCond) SelectKind = TargetLowering::VectorMaskSelect; else if (SI->getType()->isVectorTy()) SelectKind = TargetLowering::ScalarCondVectorVal; else SelectKind = TargetLowering::ScalarValSelect; // Do we have efficient codegen support for this kind of 'selects' ? if (TLI->isSelectSupported(SelectKind)) { // We have efficient codegen support for the select instruction. // Check if it is profitable to keep this 'select'. if (!TLI->isPredictableSelectExpensive() || !isFormingBranchFromSelectProfitable(TTI, SI)) return false; } ModifiedDT = true; // Transform a sequence like this: // start: // %cmp = cmp uge i32 %a, %b // %sel = select i1 %cmp, i32 %c, i32 %d // // Into: // start: // %cmp = cmp uge i32 %a, %b // br i1 %cmp, label %select.true, label %select.false // select.true: // br label %select.end // select.false: // br label %select.end // select.end: // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] // // In addition, we may sink instructions that produce %c or %d from // the entry block into the destination(s) of the new branch. // If the true or false blocks do not contain a sunken instruction, that // block and its branch may be optimized away. In that case, one side of the // first branch will point directly to select.end, and the corresponding PHI // predecessor block will be the start block. // First, we split the block containing the select into 2 blocks. BasicBlock *StartBlock = SI->getParent(); BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI)); BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); // Delete the unconditional branch that was just created by the split. StartBlock->getTerminator()->eraseFromParent(); // These are the new basic blocks for the conditional branch. // At least one will become an actual new basic block. BasicBlock *TrueBlock = nullptr; BasicBlock *FalseBlock = nullptr; // Sink expensive instructions into the conditional blocks to avoid executing // them speculatively. if (sinkSelectOperand(TTI, SI->getTrueValue())) { TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink", EndBlock->getParent(), EndBlock); auto *TrueBranch = BranchInst::Create(EndBlock, TrueBlock); auto *TrueInst = cast(SI->getTrueValue()); TrueInst->moveBefore(TrueBranch); } if (sinkSelectOperand(TTI, SI->getFalseValue())) { FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink", EndBlock->getParent(), EndBlock); auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); auto *FalseInst = cast(SI->getFalseValue()); FalseInst->moveBefore(FalseBranch); } // If there was nothing to sink, then arbitrarily choose the 'false' side // for a new input value to the PHI. if (TrueBlock == FalseBlock) { assert(TrueBlock == nullptr && "Unexpected basic block transform while optimizing select"); FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", EndBlock->getParent(), EndBlock); BranchInst::Create(EndBlock, FalseBlock); } // Insert the real conditional branch based on the original condition. // If we did not create a new block for one of the 'true' or 'false' paths // of the condition, it means that side of the branch goes to the end block // directly and the path originates from the start block from the point of // view of the new PHI. if (TrueBlock == nullptr) { BranchInst::Create(EndBlock, FalseBlock, SI->getCondition(), SI); TrueBlock = StartBlock; } else if (FalseBlock == nullptr) { BranchInst::Create(TrueBlock, EndBlock, SI->getCondition(), SI); FalseBlock = StartBlock; } else { BranchInst::Create(TrueBlock, FalseBlock, SI->getCondition(), SI); } // The select itself is replaced with a PHI Node. PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front()); PN->takeName(SI); PN->addIncoming(SI->getTrueValue(), TrueBlock); PN->addIncoming(SI->getFalseValue(), FalseBlock); SI->replaceAllUsesWith(PN); SI->eraseFromParent(); // Instruct OptimizeBlock to skip to the next block. CurInstIterator = StartBlock->end(); ++NumSelectsExpanded; return true; } static bool isBroadcastShuffle(ShuffleVectorInst *SVI) { SmallVector Mask(SVI->getShuffleMask()); int SplatElem = -1; for (unsigned i = 0; i < Mask.size(); ++i) { if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem) return false; SplatElem = Mask[i]; } return true; } /// Some targets have expensive vector shifts if the lanes aren't all the same /// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases /// it's often worth sinking a shufflevector splat down to its use so that /// codegen can spot all lanes are identical. bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) { BasicBlock *DefBB = SVI->getParent(); // Only do this xform if variable vector shifts are particularly expensive. if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType())) return false; // We only expect better codegen by sinking a shuffle if we can recognise a // constant splat. if (!isBroadcastShuffle(SVI)) return false; // InsertedShuffles - Only insert a shuffle in each block once. DenseMap InsertedShuffles; bool MadeChange = false; for (User *U : SVI->users()) { Instruction *UI = cast(U); // Figure out which BB this ext is used in. BasicBlock *UserBB = UI->getParent(); if (UserBB == DefBB) continue; // For now only apply this when the splat is used by a shift instruction. if (!UI->isShift()) continue; // Everything checks out, sink the shuffle if the user's block doesn't // already have a copy. Instruction *&InsertedShuffle = InsertedShuffles[UserBB]; if (!InsertedShuffle) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); assert(InsertPt != UserBB->end()); InsertedShuffle = new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1), SVI->getOperand(2), "", &*InsertPt); } UI->replaceUsesOfWith(SVI, InsertedShuffle); MadeChange = true; } // If we removed all uses, nuke the shuffle. if (SVI->use_empty()) { SVI->eraseFromParent(); MadeChange = true; } return MadeChange; } bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { if (!TLI || !DL) return false; Value *Cond = SI->getCondition(); Type *OldType = Cond->getType(); LLVMContext &Context = Cond->getContext(); MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType)); unsigned RegWidth = RegType.getSizeInBits(); if (RegWidth <= cast(OldType)->getBitWidth()) return false; // If the register width is greater than the type width, expand the condition // of the switch instruction and each case constant to the width of the // register. By widening the type of the switch condition, subsequent // comparisons (for case comparisons) will not need to be extended to the // preferred register width, so we will potentially eliminate N-1 extends, // where N is the number of cases in the switch. auto *NewType = Type::getIntNTy(Context, RegWidth); // Zero-extend the switch condition and case constants unless the switch // condition is a function argument that is already being sign-extended. // In that case, we can avoid an unnecessary mask/extension by sign-extending // everything instead. Instruction::CastOps ExtType = Instruction::ZExt; if (auto *Arg = dyn_cast(Cond)) if (Arg->hasSExtAttr()) ExtType = Instruction::SExt; auto *ExtInst = CastInst::Create(ExtType, Cond, NewType); ExtInst->insertBefore(SI); SI->setCondition(ExtInst); for (SwitchInst::CaseIt Case : SI->cases()) { APInt NarrowConst = Case.getCaseValue()->getValue(); APInt WideConst = (ExtType == Instruction::ZExt) ? NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth); Case.setValue(ConstantInt::get(Context, WideConst)); } return true; } namespace { /// \brief Helper class to promote a scalar operation to a vector one. /// This class is used to move downward extractelement transition. /// E.g., /// a = vector_op <2 x i32> /// b = extractelement <2 x i32> a, i32 0 /// c = scalar_op b /// store c /// /// => /// a = vector_op <2 x i32> /// c = vector_op a (equivalent to scalar_op on the related lane) /// * d = extractelement <2 x i32> c, i32 0 /// * store d /// Assuming both extractelement and store can be combine, we get rid of the /// transition. class VectorPromoteHelper { /// DataLayout associated with the current module. const DataLayout &DL; /// Used to perform some checks on the legality of vector operations. const TargetLowering &TLI; /// Used to estimated the cost of the promoted chain. const TargetTransformInfo &TTI; /// The transition being moved downwards. Instruction *Transition; /// The sequence of instructions to be promoted. SmallVector InstsToBePromoted; /// Cost of combining a store and an extract. unsigned StoreExtractCombineCost; /// Instruction that will be combined with the transition. Instruction *CombineInst; /// \brief The instruction that represents the current end of the transition. /// Since we are faking the promotion until we reach the end of the chain /// of computation, we need a way to get the current end of the transition. Instruction *getEndOfTransition() const { if (InstsToBePromoted.empty()) return Transition; return InstsToBePromoted.back(); } /// \brief Return the index of the original value in the transition. /// E.g., for "extractelement <2 x i32> c, i32 1" the original value, /// c, is at index 0. unsigned getTransitionOriginalValueIdx() const { assert(isa(Transition) && "Other kind of transitions are not supported yet"); return 0; } /// \brief Return the index of the index in the transition. /// E.g., for "extractelement <2 x i32> c, i32 0" the index /// is at index 1. unsigned getTransitionIdx() const { assert(isa(Transition) && "Other kind of transitions are not supported yet"); return 1; } /// \brief Get the type of the transition. /// This is the type of the original value. /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the /// transition is <2 x i32>. Type *getTransitionType() const { return Transition->getOperand(getTransitionOriginalValueIdx())->getType(); } /// \brief Promote \p ToBePromoted by moving \p Def downward through. /// I.e., we have the following sequence: /// Def = Transition a to /// b = ToBePromoted Def, ... /// => /// b = ToBePromoted a, ... /// Def = Transition ToBePromoted to void promoteImpl(Instruction *ToBePromoted); /// \brief Check whether or not it is profitable to promote all the /// instructions enqueued to be promoted. bool isProfitableToPromote() { Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx()); unsigned Index = isa(ValIdx) ? cast(ValIdx)->getZExtValue() : -1; Type *PromotedType = getTransitionType(); StoreInst *ST = cast(CombineInst); unsigned AS = ST->getPointerAddressSpace(); unsigned Align = ST->getAlignment(); // Check if this store is supported. if (!TLI.allowsMisalignedMemoryAccesses( TLI.getValueType(DL, ST->getValueOperand()->getType()), AS, Align)) { // If this is not supported, there is no way we can combine // the extract with the store. return false; } // The scalar chain of computation has to pay for the transition // scalar to vector. // The vector chain has to account for the combining cost. uint64_t ScalarCost = TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index); uint64_t VectorCost = StoreExtractCombineCost; for (const auto &Inst : InstsToBePromoted) { // Compute the cost. // By construction, all instructions being promoted are arithmetic ones. // Moreover, one argument is a constant that can be viewed as a splat // constant. Value *Arg0 = Inst->getOperand(0); bool IsArg0Constant = isa(Arg0) || isa(Arg0) || isa(Arg0); TargetTransformInfo::OperandValueKind Arg0OVK = IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue : TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Arg1OVK = !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue : TargetTransformInfo::OK_AnyValue; ScalarCost += TTI.getArithmeticInstrCost( Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK); VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType, Arg0OVK, Arg1OVK); } DEBUG(dbgs() << "Estimated cost of computation to be promoted:\nScalar: " << ScalarCost << "\nVector: " << VectorCost << '\n'); return ScalarCost > VectorCost; } /// \brief Generate a constant vector with \p Val with the same /// number of elements as the transition. /// \p UseSplat defines whether or not \p Val should be replicated /// across the whole vector. /// In other words, if UseSplat == true, we generate , /// otherwise we generate a vector with as many undef as possible: /// where \p Val is only /// used at the index of the extract. Value *getConstantVector(Constant *Val, bool UseSplat) const { unsigned ExtractIdx = UINT_MAX; if (!UseSplat) { // If we cannot determine where the constant must be, we have to // use a splat constant. Value *ValExtractIdx = Transition->getOperand(getTransitionIdx()); if (ConstantInt *CstVal = dyn_cast(ValExtractIdx)) ExtractIdx = CstVal->getSExtValue(); else UseSplat = true; } unsigned End = getTransitionType()->getVectorNumElements(); if (UseSplat) return ConstantVector::getSplat(End, Val); SmallVector ConstVec; UndefValue *UndefVal = UndefValue::get(Val->getType()); for (unsigned Idx = 0; Idx != End; ++Idx) { if (Idx == ExtractIdx) ConstVec.push_back(Val); else ConstVec.push_back(UndefVal); } return ConstantVector::get(ConstVec); } /// \brief Check if promoting to a vector type an operand at \p OperandIdx /// in \p Use can trigger undefined behavior. static bool canCauseUndefinedBehavior(const Instruction *Use, unsigned OperandIdx) { // This is not safe to introduce undef when the operand is on // the right hand side of a division-like instruction. if (OperandIdx != 1) return false; switch (Use->getOpcode()) { default: return false; case Instruction::SDiv: case Instruction::UDiv: case Instruction::SRem: case Instruction::URem: return true; case Instruction::FDiv: case Instruction::FRem: return !Use->hasNoNaNs(); } llvm_unreachable(nullptr); } public: VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI, const TargetTransformInfo &TTI, Instruction *Transition, unsigned CombineCost) : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition), StoreExtractCombineCost(CombineCost), CombineInst(nullptr) { assert(Transition && "Do not know how to promote null"); } /// \brief Check if we can promote \p ToBePromoted to \p Type. bool canPromote(const Instruction *ToBePromoted) const { // We could support CastInst too. return isa(ToBePromoted); } /// \brief Check if it is profitable to promote \p ToBePromoted /// by moving downward the transition through. bool shouldPromote(const Instruction *ToBePromoted) const { // Promote only if all the operands can be statically expanded. // Indeed, we do not want to introduce any new kind of transitions. for (const Use &U : ToBePromoted->operands()) { const Value *Val = U.get(); if (Val == getEndOfTransition()) { // If the use is a division and the transition is on the rhs, // we cannot promote the operation, otherwise we may create a // division by zero. if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo())) return false; continue; } if (!isa(Val) && !isa(Val) && !isa(Val)) return false; } // Check that the resulting operation is legal. int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode()); if (!ISDOpcode) return false; return StressStoreExtract || TLI.isOperationLegalOrCustom( ISDOpcode, TLI.getValueType(DL, getTransitionType(), true)); } /// \brief Check whether or not \p Use can be combined /// with the transition. /// I.e., is it possible to do Use(Transition) => AnotherUse? bool canCombine(const Instruction *Use) { return isa(Use); } /// \brief Record \p ToBePromoted as part of the chain to be promoted. void enqueueForPromotion(Instruction *ToBePromoted) { InstsToBePromoted.push_back(ToBePromoted); } /// \brief Set the instruction that will be combined with the transition. void recordCombineInstruction(Instruction *ToBeCombined) { assert(canCombine(ToBeCombined) && "Unsupported instruction to combine"); CombineInst = ToBeCombined; } /// \brief Promote all the instructions enqueued for promotion if it is /// is profitable. /// \return True if the promotion happened, false otherwise. bool promote() { // Check if there is something to promote. // Right now, if we do not have anything to combine with, // we assume the promotion is not profitable. if (InstsToBePromoted.empty() || !CombineInst) return false; // Check cost. if (!StressStoreExtract && !isProfitableToPromote()) return false; // Promote. for (auto &ToBePromoted : InstsToBePromoted) promoteImpl(ToBePromoted); InstsToBePromoted.clear(); return true; } }; } // End of anonymous namespace. void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) { // At this point, we know that all the operands of ToBePromoted but Def // can be statically promoted. // For Def, we need to use its parameter in ToBePromoted: // b = ToBePromoted ty1 a // Def = Transition ty1 b to ty2 // Move the transition down. // 1. Replace all uses of the promoted operation by the transition. // = ... b => = ... Def. assert(ToBePromoted->getType() == Transition->getType() && "The type of the result of the transition does not match " "the final type"); ToBePromoted->replaceAllUsesWith(Transition); // 2. Update the type of the uses. // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def. Type *TransitionTy = getTransitionType(); ToBePromoted->mutateType(TransitionTy); // 3. Update all the operands of the promoted operation with promoted // operands. // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a. for (Use &U : ToBePromoted->operands()) { Value *Val = U.get(); Value *NewVal = nullptr; if (Val == Transition) NewVal = Transition->getOperand(getTransitionOriginalValueIdx()); else if (isa(Val) || isa(Val) || isa(Val)) { // Use a splat constant if it is not safe to use undef. NewVal = getConstantVector( cast(Val), isa(Val) || canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo())); } else llvm_unreachable("Did you modified shouldPromote and forgot to update " "this?"); ToBePromoted->setOperand(U.getOperandNo(), NewVal); } Transition->removeFromParent(); Transition->insertAfter(ToBePromoted); Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted); } /// Some targets can do store(extractelement) with one instruction. /// Try to push the extractelement towards the stores when the target /// has this feature and this is profitable. bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) { unsigned CombineCost = UINT_MAX; if (DisableStoreExtract || !TLI || (!StressStoreExtract && !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(), Inst->getOperand(1), CombineCost))) return false; // At this point we know that Inst is a vector to scalar transition. // Try to move it down the def-use chain, until: // - We can combine the transition with its single use // => we got rid of the transition. // - We escape the current basic block // => we would need to check that we are moving it at a cheaper place and // we do not do that for now. BasicBlock *Parent = Inst->getParent(); DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n'); VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost); // If the transition has more than one use, assume this is not going to be // beneficial. while (Inst->hasOneUse()) { Instruction *ToBePromoted = cast(*Inst->user_begin()); DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n'); if (ToBePromoted->getParent() != Parent) { DEBUG(dbgs() << "Instruction to promote is in a different block (" << ToBePromoted->getParent()->getName() << ") than the transition (" << Parent->getName() << ").\n"); return false; } if (VPH.canCombine(ToBePromoted)) { DEBUG(dbgs() << "Assume " << *Inst << '\n' << "will be combined with: " << *ToBePromoted << '\n'); VPH.recordCombineInstruction(ToBePromoted); bool Changed = VPH.promote(); NumStoreExtractExposed += Changed; return Changed; } DEBUG(dbgs() << "Try promoting.\n"); if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted)) return false; DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n"); VPH.enqueueForPromotion(ToBePromoted); Inst = ToBePromoted; } return false; } bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) { // Bail out if we inserted the instruction to prevent optimizations from // stepping on each other's toes. if (InsertedInsts.count(I)) return false; if (PHINode *P = dyn_cast(I)) { // It is possible for very late stage optimizations (such as SimplifyCFG) // to introduce PHI nodes too late to be cleaned up. If we detect such a // trivial PHI, go ahead and zap it here. if (Value *V = SimplifyInstruction(P, *DL, TLInfo, nullptr)) { P->replaceAllUsesWith(V); P->eraseFromParent(); ++NumPHIsElim; return true; } return false; } if (CastInst *CI = dyn_cast(I)) { // If the source of the cast is a constant, then this should have // already been constant folded. The only reason NOT to constant fold // it is if something (e.g. LSR) was careful to place the constant // evaluation in a block other than then one that uses it (e.g. to hoist // the address of globals out of a loop). If this is the case, we don't // want to forward-subst the cast. if (isa(CI->getOperand(0))) return false; if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL)) return true; if (isa(I) || isa(I)) { /// Sink a zext or sext into its user blocks if the target type doesn't /// fit in one register if (TLI && TLI->getTypeAction(CI->getContext(), TLI->getValueType(*DL, CI->getType())) == TargetLowering::TypeExpandInteger) { return SinkCast(CI); } else { bool MadeChange = moveExtToFormExtLoad(I); return MadeChange | optimizeExtUses(I); } } return false; } if (CmpInst *CI = dyn_cast(I)) if (!TLI || !TLI->hasMultipleConditionRegisters()) return OptimizeCmpExpression(CI); if (LoadInst *LI = dyn_cast(I)) { stripInvariantGroupMetadata(*LI); if (TLI) { bool Modified = optimizeLoadExt(LI); unsigned AS = LI->getPointerAddressSpace(); Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS); return Modified; } return false; } if (StoreInst *SI = dyn_cast(I)) { stripInvariantGroupMetadata(*SI); if (TLI) { unsigned AS = SI->getPointerAddressSpace(); return optimizeMemoryInst(I, SI->getOperand(1), SI->getOperand(0)->getType(), AS); } return false; } BinaryOperator *BinOp = dyn_cast(I); if (BinOp && (BinOp->getOpcode() == Instruction::AShr || BinOp->getOpcode() == Instruction::LShr)) { ConstantInt *CI = dyn_cast(BinOp->getOperand(1)); if (TLI && CI && TLI->hasExtractBitsInsn()) return OptimizeExtractBits(BinOp, CI, *TLI, *DL); return false; } if (GetElementPtrInst *GEPI = dyn_cast(I)) { if (GEPI->hasAllZeroIndices()) { /// The GEP operand must be a pointer, so must its result -> BitCast Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(), GEPI->getName(), GEPI); GEPI->replaceAllUsesWith(NC); GEPI->eraseFromParent(); ++NumGEPsElim; optimizeInst(NC, ModifiedDT); return true; } return false; } if (CallInst *CI = dyn_cast(I)) return optimizeCallInst(CI, ModifiedDT); if (SelectInst *SI = dyn_cast(I)) return optimizeSelectInst(SI); if (ShuffleVectorInst *SVI = dyn_cast(I)) return optimizeShuffleVectorInst(SVI); if (auto *Switch = dyn_cast(I)) return optimizeSwitchInst(Switch); if (isa(I)) return optimizeExtractElementInst(I); return false; } +/// Given an OR instruction, check to see if this is a bitreverse +/// idiom. If so, insert the new intrinsic and return true. +static bool makeBitReverse(Instruction &I, const DataLayout &DL, + const TargetLowering &TLI) { + if (!I.getType()->isIntegerTy() || + !TLI.isOperationLegalOrCustom(ISD::BITREVERSE, + TLI.getValueType(DL, I.getType(), true))) + return false; + + SmallVector Insts; + if (!recognizeBitReverseOrBSwapIdiom(&I, false, true, Insts)) + return false; + Instruction *LastInst = Insts.back(); + I.replaceAllUsesWith(LastInst); + RecursivelyDeleteTriviallyDeadInstructions(&I); + return true; +} + // In this pass we look for GEP and cast instructions that are used // across basic blocks and rewrite them to improve basic-block-at-a-time // selection. bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) { SunkAddrs.clear(); bool MadeChange = false; CurInstIterator = BB.begin(); while (CurInstIterator != BB.end()) { MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT); if (ModifiedDT) return true; } - MadeChange |= dupRetToEnableTailCallOpts(&BB); + bool MadeBitReverse = true; + while (TLI && MadeBitReverse) { + MadeBitReverse = false; + for (auto &I : reverse(BB)) { + if (makeBitReverse(I, *DL, *TLI)) { + MadeBitReverse = MadeChange = true; + break; + } + } + } + MadeChange |= dupRetToEnableTailCallOpts(&BB); + return MadeChange; } // llvm.dbg.value is far away from the value then iSel may not be able // handle it properly. iSel will drop llvm.dbg.value if it can not // find a node corresponding to the value. bool CodeGenPrepare::placeDbgValues(Function &F) { bool MadeChange = false; for (BasicBlock &BB : F) { Instruction *PrevNonDbgInst = nullptr; for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { Instruction *Insn = &*BI++; DbgValueInst *DVI = dyn_cast(Insn); // Leave dbg.values that refer to an alloca alone. These // instrinsics describe the address of a variable (= the alloca) // being taken. They should not be moved next to the alloca // (and to the beginning of the scope), but rather stay close to // where said address is used. if (!DVI || (DVI->getValue() && isa(DVI->getValue()))) { PrevNonDbgInst = Insn; continue; } Instruction *VI = dyn_cast_or_null(DVI->getValue()); if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) { // If VI is a phi in a block with an EHPad terminator, we can't insert // after it. if (isa(VI) && VI->getParent()->getTerminator()->isEHPad()) continue; DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI); DVI->removeFromParent(); if (isa(VI)) DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt()); else DVI->insertAfter(VI); MadeChange = true; ++NumDbgValueMoved; } } } return MadeChange; } // If there is a sequence that branches based on comparing a single bit // against zero that can be combined into a single instruction, and the // target supports folding these into a single instruction, sink the // mask and compare into the branch uses. Do this before OptimizeBlock -> // OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being // searched for. bool CodeGenPrepare::sinkAndCmp(Function &F) { if (!EnableAndCmpSinking) return false; if (!TLI || !TLI->isMaskAndBranchFoldingLegal()) return false; bool MadeChange = false; for (Function::iterator I = F.begin(), E = F.end(); I != E; ) { BasicBlock *BB = &*I++; // Does this BB end with the following? // %andVal = and %val, #single-bit-set // %icmpVal = icmp %andResult, 0 // br i1 %cmpVal label %dest1, label %dest2" BranchInst *Brcc = dyn_cast(BB->getTerminator()); if (!Brcc || !Brcc->isConditional()) continue; ICmpInst *Cmp = dyn_cast(Brcc->getOperand(0)); if (!Cmp || Cmp->getParent() != BB) continue; ConstantInt *Zero = dyn_cast(Cmp->getOperand(1)); if (!Zero || !Zero->isZero()) continue; Instruction *And = dyn_cast(Cmp->getOperand(0)); if (!And || And->getOpcode() != Instruction::And || And->getParent() != BB) continue; ConstantInt* Mask = dyn_cast(And->getOperand(1)); if (!Mask || !Mask->getUniqueInteger().isPowerOf2()) continue; DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB->dump()); // Push the "and; icmp" for any users that are conditional branches. // Since there can only be one branch use per BB, we don't need to keep // track of which BBs we insert into. for (Value::use_iterator UI = Cmp->use_begin(), E = Cmp->use_end(); UI != E; ) { Use &TheUse = *UI; // Find brcc use. BranchInst *BrccUser = dyn_cast(*UI); ++UI; if (!BrccUser || !BrccUser->isConditional()) continue; BasicBlock *UserBB = BrccUser->getParent(); if (UserBB == BB) continue; DEBUG(dbgs() << "found Brcc use\n"); // Sink the "and; icmp" to use. MadeChange = true; BinaryOperator *NewAnd = BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "", BrccUser); CmpInst *NewCmp = CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero, "", BrccUser); TheUse = NewCmp; ++NumAndCmpsMoved; DEBUG(BrccUser->getParent()->dump()); } } return MadeChange; } /// \brief Retrieve the probabilities of a conditional branch. Returns true on /// success, or returns false if no or invalid metadata was found. static bool extractBranchMetadata(BranchInst *BI, uint64_t &ProbTrue, uint64_t &ProbFalse) { assert(BI->isConditional() && "Looking for probabilities on unconditional branch?"); auto *ProfileData = BI->getMetadata(LLVMContext::MD_prof); if (!ProfileData || ProfileData->getNumOperands() != 3) return false; const auto *CITrue = mdconst::dyn_extract(ProfileData->getOperand(1)); const auto *CIFalse = mdconst::dyn_extract(ProfileData->getOperand(2)); if (!CITrue || !CIFalse) return false; ProbTrue = CITrue->getValue().getZExtValue(); ProbFalse = CIFalse->getValue().getZExtValue(); return true; } /// \brief Scale down both weights to fit into uint32_t. static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) { uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse; uint32_t Scale = (NewMax / UINT32_MAX) + 1; NewTrue = NewTrue / Scale; NewFalse = NewFalse / Scale; } /// \brief Some targets prefer to split a conditional branch like: /// \code /// %0 = icmp ne i32 %a, 0 /// %1 = icmp ne i32 %b, 0 /// %or.cond = or i1 %0, %1 /// br i1 %or.cond, label %TrueBB, label %FalseBB /// \endcode /// into multiple branch instructions like: /// \code /// bb1: /// %0 = icmp ne i32 %a, 0 /// br i1 %0, label %TrueBB, label %bb2 /// bb2: /// %1 = icmp ne i32 %b, 0 /// br i1 %1, label %TrueBB, label %FalseBB /// \endcode /// This usually allows instruction selection to do even further optimizations /// and combine the compare with the branch instruction. Currently this is /// applied for targets which have "cheap" jump instructions. /// /// FIXME: Remove the (equivalent?) implementation in SelectionDAG. /// bool CodeGenPrepare::splitBranchCondition(Function &F) { if (!TM || !TM->Options.EnableFastISel || !TLI || TLI->isJumpExpensive()) return false; bool MadeChange = false; for (auto &BB : F) { // Does this BB end with the following? // %cond1 = icmp|fcmp|binary instruction ... // %cond2 = icmp|fcmp|binary instruction ... // %cond.or = or|and i1 %cond1, cond2 // br i1 %cond.or label %dest1, label %dest2" BinaryOperator *LogicOp; BasicBlock *TBB, *FBB; if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB))) continue; auto *Br1 = cast(BB.getTerminator()); if (Br1->getMetadata(LLVMContext::MD_unpredictable)) continue; unsigned Opc; Value *Cond1, *Cond2; if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)), m_OneUse(m_Value(Cond2))))) Opc = Instruction::And; else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)), m_OneUse(m_Value(Cond2))))) Opc = Instruction::Or; else continue; if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) || !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp())) ) continue; DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump()); // Create a new BB. auto *InsertBefore = std::next(Function::iterator(BB)) .getNodePtrUnchecked(); auto TmpBB = BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split", BB.getParent(), InsertBefore); // Update original basic block by using the first condition directly by the // branch instruction and removing the no longer needed and/or instruction. Br1->setCondition(Cond1); LogicOp->eraseFromParent(); // Depending on the conditon we have to either replace the true or the false // successor of the original branch instruction. if (Opc == Instruction::And) Br1->setSuccessor(0, TmpBB); else Br1->setSuccessor(1, TmpBB); // Fill in the new basic block. auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB); if (auto *I = dyn_cast(Cond2)) { I->removeFromParent(); I->insertBefore(Br2); } // Update PHI nodes in both successors. The original BB needs to be // replaced in one succesor's PHI nodes, because the branch comes now from // the newly generated BB (NewBB). In the other successor we need to add one // incoming edge to the PHI nodes, because both branch instructions target // now the same successor. Depending on the original branch condition // (and/or) we have to swap the successors (TrueDest, FalseDest), so that // we perfrom the correct update for the PHI nodes. // This doesn't change the successor order of the just created branch // instruction (or any other instruction). if (Opc == Instruction::Or) std::swap(TBB, FBB); // Replace the old BB with the new BB. for (auto &I : *TBB) { PHINode *PN = dyn_cast(&I); if (!PN) break; int i; while ((i = PN->getBasicBlockIndex(&BB)) >= 0) PN->setIncomingBlock(i, TmpBB); } // Add another incoming edge form the new BB. for (auto &I : *FBB) { PHINode *PN = dyn_cast(&I); if (!PN) break; auto *Val = PN->getIncomingValueForBlock(&BB); PN->addIncoming(Val, TmpBB); } // Update the branch weights (from SelectionDAGBuilder:: // FindMergedConditions). if (Opc == Instruction::Or) { // Codegen X | Y as: // BB1: // jmp_if_X TBB // jmp TmpBB // TmpBB: // jmp_if_Y TBB // jmp FBB // // We have flexibility in setting Prob for BB1 and Prob for NewBB. // The requirement is that // TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB) // = TrueProb for orignal BB. // Assuming the orignal weights are A and B, one choice is to set BB1's // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice // assumes that // TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB. // Another choice is to assume TrueProb for BB1 equals to TrueProb for // TmpBB, but the math is more complicated. uint64_t TrueWeight, FalseWeight; if (extractBranchMetadata(Br1, TrueWeight, FalseWeight)) { uint64_t NewTrueWeight = TrueWeight; uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight; scaleWeights(NewTrueWeight, NewFalseWeight); Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext()) .createBranchWeights(TrueWeight, FalseWeight)); NewTrueWeight = TrueWeight; NewFalseWeight = 2 * FalseWeight; scaleWeights(NewTrueWeight, NewFalseWeight); Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext()) .createBranchWeights(TrueWeight, FalseWeight)); } } else { // Codegen X & Y as: // BB1: // jmp_if_X TmpBB // jmp FBB // TmpBB: // jmp_if_Y TBB // jmp FBB // // This requires creation of TmpBB after CurBB. // We have flexibility in setting Prob for BB1 and Prob for TmpBB. // The requirement is that // FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB) // = FalseProb for orignal BB. // Assuming the orignal weights are A and B, one choice is to set BB1's // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice // assumes that // FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB. uint64_t TrueWeight, FalseWeight; if (extractBranchMetadata(Br1, TrueWeight, FalseWeight)) { uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight; uint64_t NewFalseWeight = FalseWeight; scaleWeights(NewTrueWeight, NewFalseWeight); Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext()) .createBranchWeights(TrueWeight, FalseWeight)); NewTrueWeight = 2 * TrueWeight; NewFalseWeight = FalseWeight; scaleWeights(NewTrueWeight, NewFalseWeight); Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext()) .createBranchWeights(TrueWeight, FalseWeight)); } } // Note: No point in getting fancy here, since the DT info is never // available to CodeGenPrepare. ModifiedDT = true; MadeChange = true; DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump(); TmpBB->dump()); } return MadeChange; } void CodeGenPrepare::stripInvariantGroupMetadata(Instruction &I) { if (auto *InvariantMD = I.getMetadata(LLVMContext::MD_invariant_group)) I.dropUnknownNonDebugMetadata(InvariantMD->getMetadataID()); } Index: projects/clang380-import/contrib/llvm/lib/CodeGen/MachineFunction.cpp =================================================================== --- projects/clang380-import/contrib/llvm/lib/CodeGen/MachineFunction.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/lib/CodeGen/MachineFunction.cpp (revision 294609) @@ -1,970 +1,970 @@ //===-- MachineFunction.cpp -----------------------------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // Collect native machine code information for a function. This allows // target-specific information about the generated code to be stored with each // function. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineFunction.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionInitializer.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSlotTracker.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "codegen" static cl::opt AlignAllFunctions("align-all-functions", cl::desc("Force the alignment of all functions."), cl::init(0), cl::Hidden); void MachineFunctionInitializer::anchor() {} //===----------------------------------------------------------------------===// // MachineFunction implementation //===----------------------------------------------------------------------===// // Out-of-line virtual method. MachineFunctionInfo::~MachineFunctionInfo() {} void ilist_traits::deleteNode(MachineBasicBlock *MBB) { MBB->getParent()->DeleteMachineBasicBlock(MBB); } MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM, unsigned FunctionNum, MachineModuleInfo &mmi) : Fn(F), Target(TM), STI(TM.getSubtargetImpl(*F)), Ctx(mmi.getContext()), MMI(mmi) { if (STI->getRegisterInfo()) RegInfo = new (Allocator) MachineRegisterInfo(this); else RegInfo = nullptr; MFInfo = nullptr; FrameInfo = new (Allocator) MachineFrameInfo(STI->getFrameLowering()->getStackAlignment(), STI->getFrameLowering()->isStackRealignable(), !F->hasFnAttribute("no-realign-stack")); if (Fn->hasFnAttribute(Attribute::StackAlignment)) FrameInfo->ensureMaxAlignment(Fn->getFnStackAlignment()); ConstantPool = new (Allocator) MachineConstantPool(getDataLayout()); Alignment = STI->getTargetLowering()->getMinFunctionAlignment(); // FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn. // FIXME: Use Function::optForSize(). if (!Fn->hasFnAttribute(Attribute::OptimizeForSize)) Alignment = std::max(Alignment, STI->getTargetLowering()->getPrefFunctionAlignment()); if (AlignAllFunctions) Alignment = AlignAllFunctions; FunctionNumber = FunctionNum; JumpTableInfo = nullptr; if (isFuncletEHPersonality(classifyEHPersonality( F->hasPersonalityFn() ? F->getPersonalityFn() : nullptr))) { WinEHInfo = new (Allocator) WinEHFuncInfo(); } assert(TM.isCompatibleDataLayout(getDataLayout()) && "Can't create a MachineFunction using a Module with a " "Target-incompatible DataLayout attached\n"); PSVManager = llvm::make_unique(); } MachineFunction::~MachineFunction() { // Don't call destructors on MachineInstr and MachineOperand. All of their // memory comes from the BumpPtrAllocator which is about to be purged. // // Do call MachineBasicBlock destructors, it contains std::vectors. for (iterator I = begin(), E = end(); I != E; I = BasicBlocks.erase(I)) I->Insts.clearAndLeakNodesUnsafely(); InstructionRecycler.clear(Allocator); OperandRecycler.clear(Allocator); BasicBlockRecycler.clear(Allocator); if (RegInfo) { RegInfo->~MachineRegisterInfo(); Allocator.Deallocate(RegInfo); } if (MFInfo) { MFInfo->~MachineFunctionInfo(); Allocator.Deallocate(MFInfo); } FrameInfo->~MachineFrameInfo(); Allocator.Deallocate(FrameInfo); ConstantPool->~MachineConstantPool(); Allocator.Deallocate(ConstantPool); if (JumpTableInfo) { JumpTableInfo->~MachineJumpTableInfo(); Allocator.Deallocate(JumpTableInfo); } if (WinEHInfo) { WinEHInfo->~WinEHFuncInfo(); Allocator.Deallocate(WinEHInfo); } } const DataLayout &MachineFunction::getDataLayout() const { return Fn->getParent()->getDataLayout(); } /// Get the JumpTableInfo for this function. /// If it does not already exist, allocate one. MachineJumpTableInfo *MachineFunction:: getOrCreateJumpTableInfo(unsigned EntryKind) { if (JumpTableInfo) return JumpTableInfo; JumpTableInfo = new (Allocator) MachineJumpTableInfo((MachineJumpTableInfo::JTEntryKind)EntryKind); return JumpTableInfo; } /// Should we be emitting segmented stack stuff for the function -bool MachineFunction::shouldSplitStack() { +bool MachineFunction::shouldSplitStack() const { return getFunction()->hasFnAttribute("split-stack"); } /// This discards all of the MachineBasicBlock numbers and recomputes them. /// This guarantees that the MBB numbers are sequential, dense, and match the /// ordering of the blocks within the function. If a specific MachineBasicBlock /// is specified, only that block and those after it are renumbered. void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) { if (empty()) { MBBNumbering.clear(); return; } MachineFunction::iterator MBBI, E = end(); if (MBB == nullptr) MBBI = begin(); else MBBI = MBB->getIterator(); // Figure out the block number this should have. unsigned BlockNo = 0; if (MBBI != begin()) BlockNo = std::prev(MBBI)->getNumber() + 1; for (; MBBI != E; ++MBBI, ++BlockNo) { if (MBBI->getNumber() != (int)BlockNo) { // Remove use of the old number. if (MBBI->getNumber() != -1) { assert(MBBNumbering[MBBI->getNumber()] == &*MBBI && "MBB number mismatch!"); MBBNumbering[MBBI->getNumber()] = nullptr; } // If BlockNo is already taken, set that block's number to -1. if (MBBNumbering[BlockNo]) MBBNumbering[BlockNo]->setNumber(-1); MBBNumbering[BlockNo] = &*MBBI; MBBI->setNumber(BlockNo); } } // Okay, all the blocks are renumbered. If we have compactified the block // numbering, shrink MBBNumbering now. assert(BlockNo <= MBBNumbering.size() && "Mismatch!"); MBBNumbering.resize(BlockNo); } /// Allocate a new MachineInstr. Use this instead of `new MachineInstr'. MachineInstr * MachineFunction::CreateMachineInstr(const MCInstrDesc &MCID, DebugLoc DL, bool NoImp) { return new (InstructionRecycler.Allocate(Allocator)) MachineInstr(*this, MCID, DL, NoImp); } /// Create a new MachineInstr which is a copy of the 'Orig' instruction, /// identical in all ways except the instruction has no parent, prev, or next. MachineInstr * MachineFunction::CloneMachineInstr(const MachineInstr *Orig) { return new (InstructionRecycler.Allocate(Allocator)) MachineInstr(*this, *Orig); } /// Delete the given MachineInstr. /// /// This function also serves as the MachineInstr destructor - the real /// ~MachineInstr() destructor must be empty. void MachineFunction::DeleteMachineInstr(MachineInstr *MI) { // Strip it for parts. The operand array and the MI object itself are // independently recyclable. if (MI->Operands) deallocateOperandArray(MI->CapOperands, MI->Operands); // Don't call ~MachineInstr() which must be trivial anyway because // ~MachineFunction drops whole lists of MachineInstrs wihout calling their // destructors. InstructionRecycler.Deallocate(Allocator, MI); } /// Allocate a new MachineBasicBlock. Use this instead of /// `new MachineBasicBlock'. MachineBasicBlock * MachineFunction::CreateMachineBasicBlock(const BasicBlock *bb) { return new (BasicBlockRecycler.Allocate(Allocator)) MachineBasicBlock(*this, bb); } /// Delete the given MachineBasicBlock. void MachineFunction::DeleteMachineBasicBlock(MachineBasicBlock *MBB) { assert(MBB->getParent() == this && "MBB parent mismatch!"); MBB->~MachineBasicBlock(); BasicBlockRecycler.Deallocate(Allocator, MBB); } MachineMemOperand * MachineFunction::getMachineMemOperand(MachinePointerInfo PtrInfo, unsigned f, uint64_t s, unsigned base_alignment, const AAMDNodes &AAInfo, const MDNode *Ranges) { return new (Allocator) MachineMemOperand(PtrInfo, f, s, base_alignment, AAInfo, Ranges); } MachineMemOperand * MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO, int64_t Offset, uint64_t Size) { if (MMO->getValue()) return new (Allocator) MachineMemOperand(MachinePointerInfo(MMO->getValue(), MMO->getOffset()+Offset), MMO->getFlags(), Size, MMO->getBaseAlignment()); return new (Allocator) MachineMemOperand(MachinePointerInfo(MMO->getPseudoValue(), MMO->getOffset()+Offset), MMO->getFlags(), Size, MMO->getBaseAlignment()); } MachineInstr::mmo_iterator MachineFunction::allocateMemRefsArray(unsigned long Num) { return Allocator.Allocate(Num); } std::pair MachineFunction::extractLoadMemRefs(MachineInstr::mmo_iterator Begin, MachineInstr::mmo_iterator End) { // Count the number of load mem refs. unsigned Num = 0; for (MachineInstr::mmo_iterator I = Begin; I != End; ++I) if ((*I)->isLoad()) ++Num; // Allocate a new array and populate it with the load information. MachineInstr::mmo_iterator Result = allocateMemRefsArray(Num); unsigned Index = 0; for (MachineInstr::mmo_iterator I = Begin; I != End; ++I) { if ((*I)->isLoad()) { if (!(*I)->isStore()) // Reuse the MMO. Result[Index] = *I; else { // Clone the MMO and unset the store flag. MachineMemOperand *JustLoad = getMachineMemOperand((*I)->getPointerInfo(), (*I)->getFlags() & ~MachineMemOperand::MOStore, (*I)->getSize(), (*I)->getBaseAlignment(), (*I)->getAAInfo()); Result[Index] = JustLoad; } ++Index; } } return std::make_pair(Result, Result + Num); } std::pair MachineFunction::extractStoreMemRefs(MachineInstr::mmo_iterator Begin, MachineInstr::mmo_iterator End) { // Count the number of load mem refs. unsigned Num = 0; for (MachineInstr::mmo_iterator I = Begin; I != End; ++I) if ((*I)->isStore()) ++Num; // Allocate a new array and populate it with the store information. MachineInstr::mmo_iterator Result = allocateMemRefsArray(Num); unsigned Index = 0; for (MachineInstr::mmo_iterator I = Begin; I != End; ++I) { if ((*I)->isStore()) { if (!(*I)->isLoad()) // Reuse the MMO. Result[Index] = *I; else { // Clone the MMO and unset the load flag. MachineMemOperand *JustStore = getMachineMemOperand((*I)->getPointerInfo(), (*I)->getFlags() & ~MachineMemOperand::MOLoad, (*I)->getSize(), (*I)->getBaseAlignment(), (*I)->getAAInfo()); Result[Index] = JustStore; } ++Index; } } return std::make_pair(Result, Result + Num); } const char *MachineFunction::createExternalSymbolName(StringRef Name) { char *Dest = Allocator.Allocate(Name.size() + 1); std::copy(Name.begin(), Name.end(), Dest); Dest[Name.size()] = 0; return Dest; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void MachineFunction::dump() const { print(dbgs()); } #endif StringRef MachineFunction::getName() const { assert(getFunction() && "No function!"); return getFunction()->getName(); } void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const { OS << "# Machine code for function " << getName() << ": "; if (RegInfo) { OS << (RegInfo->isSSA() ? "SSA" : "Post SSA"); if (!RegInfo->tracksLiveness()) OS << ", not tracking liveness"; } OS << '\n'; // Print Frame Information FrameInfo->print(*this, OS); // Print JumpTable Information if (JumpTableInfo) JumpTableInfo->print(OS); // Print Constant Pool ConstantPool->print(OS); const TargetRegisterInfo *TRI = getSubtarget().getRegisterInfo(); if (RegInfo && !RegInfo->livein_empty()) { OS << "Function Live Ins: "; for (MachineRegisterInfo::livein_iterator I = RegInfo->livein_begin(), E = RegInfo->livein_end(); I != E; ++I) { OS << PrintReg(I->first, TRI); if (I->second) OS << " in " << PrintReg(I->second, TRI); if (std::next(I) != E) OS << ", "; } OS << '\n'; } ModuleSlotTracker MST(getFunction()->getParent()); MST.incorporateFunction(*getFunction()); for (const auto &BB : *this) { OS << '\n'; BB.print(OS, MST, Indexes); } OS << "\n# End machine code for function " << getName() << ".\n\n"; } namespace llvm { template<> struct DOTGraphTraits : public DefaultDOTGraphTraits { DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} static std::string getGraphName(const MachineFunction *F) { return ("CFG for '" + F->getName() + "' function").str(); } std::string getNodeLabel(const MachineBasicBlock *Node, const MachineFunction *Graph) { std::string OutStr; { raw_string_ostream OSS(OutStr); if (isSimple()) { OSS << "BB#" << Node->getNumber(); if (const BasicBlock *BB = Node->getBasicBlock()) OSS << ": " << BB->getName(); } else Node->print(OSS); } if (OutStr[0] == '\n') OutStr.erase(OutStr.begin()); // Process string output to make it nicer... for (unsigned i = 0; i != OutStr.length(); ++i) if (OutStr[i] == '\n') { // Left justify OutStr[i] = '\\'; OutStr.insert(OutStr.begin()+i+1, 'l'); } return OutStr; } }; } void MachineFunction::viewCFG() const { #ifndef NDEBUG ViewGraph(this, "mf" + getName()); #else errs() << "MachineFunction::viewCFG is only available in debug builds on " << "systems with Graphviz or gv!\n"; #endif // NDEBUG } void MachineFunction::viewCFGOnly() const { #ifndef NDEBUG ViewGraph(this, "mf" + getName(), true); #else errs() << "MachineFunction::viewCFGOnly is only available in debug builds on " << "systems with Graphviz or gv!\n"; #endif // NDEBUG } /// Add the specified physical register as a live-in value and /// create a corresponding virtual register for it. unsigned MachineFunction::addLiveIn(unsigned PReg, const TargetRegisterClass *RC) { MachineRegisterInfo &MRI = getRegInfo(); unsigned VReg = MRI.getLiveInVirtReg(PReg); if (VReg) { const TargetRegisterClass *VRegRC = MRI.getRegClass(VReg); (void)VRegRC; // A physical register can be added several times. // Between two calls, the register class of the related virtual register // may have been constrained to match some operation constraints. // In that case, check that the current register class includes the // physical register and is a sub class of the specified RC. assert((VRegRC == RC || (VRegRC->contains(PReg) && RC->hasSubClassEq(VRegRC))) && "Register class mismatch!"); return VReg; } VReg = MRI.createVirtualRegister(RC); MRI.addLiveIn(PReg, VReg); return VReg; } /// Return the MCSymbol for the specified non-empty jump table. /// If isLinkerPrivate is specified, an 'l' label is returned, otherwise a /// normal 'L' label is returned. MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx, bool isLinkerPrivate) const { const DataLayout &DL = getDataLayout(); assert(JumpTableInfo && "No jump tables"); assert(JTI < JumpTableInfo->getJumpTables().size() && "Invalid JTI!"); const char *Prefix = isLinkerPrivate ? DL.getLinkerPrivateGlobalPrefix() : DL.getPrivateGlobalPrefix(); SmallString<60> Name; raw_svector_ostream(Name) << Prefix << "JTI" << getFunctionNumber() << '_' << JTI; return Ctx.getOrCreateSymbol(Name); } /// Return a function-local symbol to represent the PIC base. MCSymbol *MachineFunction::getPICBaseSymbol() const { const DataLayout &DL = getDataLayout(); return Ctx.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + Twine(getFunctionNumber()) + "$pb"); } //===----------------------------------------------------------------------===// // MachineFrameInfo implementation //===----------------------------------------------------------------------===// /// Make sure the function is at least Align bytes aligned. void MachineFrameInfo::ensureMaxAlignment(unsigned Align) { if (!StackRealignable || !RealignOption) assert(Align <= StackAlignment && "For targets without stack realignment, Align is out of limit!"); if (MaxAlignment < Align) MaxAlignment = Align; } /// Clamp the alignment if requested and emit a warning. static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align, unsigned StackAlign) { if (!ShouldClamp || Align <= StackAlign) return Align; DEBUG(dbgs() << "Warning: requested alignment " << Align << " exceeds the stack alignment " << StackAlign << " when stack realignment is off" << '\n'); return StackAlign; } /// Create a new statically sized stack object, returning a nonnegative /// identifier to represent it. int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment, bool isSS, const AllocaInst *Alloca) { assert(Size != 0 && "Cannot allocate zero size stack objects!"); Alignment = clampStackAlignment(!StackRealignable || !RealignOption, Alignment, StackAlignment); Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, Alloca, !isSS)); int Index = (int)Objects.size() - NumFixedObjects - 1; assert(Index >= 0 && "Bad frame index!"); ensureMaxAlignment(Alignment); return Index; } /// Create a new statically sized stack object that represents a spill slot, /// returning a nonnegative identifier to represent it. int MachineFrameInfo::CreateSpillStackObject(uint64_t Size, unsigned Alignment) { Alignment = clampStackAlignment(!StackRealignable || !RealignOption, Alignment, StackAlignment); CreateStackObject(Size, Alignment, true); int Index = (int)Objects.size() - NumFixedObjects - 1; ensureMaxAlignment(Alignment); return Index; } /// Notify the MachineFrameInfo object that a variable sized object has been /// created. This must be created whenever a variable sized object is created, /// whether or not the index returned is actually used. int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment, const AllocaInst *Alloca) { HasVarSizedObjects = true; Alignment = clampStackAlignment(!StackRealignable || !RealignOption, Alignment, StackAlignment); Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca, true)); ensureMaxAlignment(Alignment); return (int)Objects.size()-NumFixedObjects-1; } /// Create a new object at a fixed location on the stack. /// All fixed objects should be created before other objects are created for /// efficiency. By default, fixed objects are immutable. This returns an /// index with a negative value. int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, bool Immutable, bool isAliased) { assert(Size != 0 && "Cannot allocate zero size fixed stack objects!"); // The alignment of the frame index can be determined from its offset from // the incoming frame position. If the frame object is at offset 32 and // the stack is guaranteed to be 16-byte aligned, then we know that the // object is 16-byte aligned. unsigned Align = MinAlign(SPOffset, StackAlignment); Align = clampStackAlignment(!StackRealignable || !RealignOption, Align, StackAlignment); Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, /*isSS*/ false, /*Alloca*/ nullptr, isAliased)); return -++NumFixedObjects; } /// Create a spill slot at a fixed location on the stack. /// Returns an index with a negative value. int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size, int64_t SPOffset) { unsigned Align = MinAlign(SPOffset, StackAlignment); Align = clampStackAlignment(!StackRealignable || !RealignOption, Align, StackAlignment); Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, /*Immutable*/ true, /*isSS*/ true, /*Alloca*/ nullptr, /*isAliased*/ false)); return -++NumFixedObjects; } BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); BitVector BV(TRI->getNumRegs()); // Before CSI is calculated, no registers are considered pristine. They can be // freely used and PEI will make sure they are saved. if (!isCalleeSavedInfoValid()) return BV; for (const MCPhysReg *CSR = TRI->getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR) BV.set(*CSR); // Saved CSRs are not pristine. for (auto &I : getCalleeSavedInfo()) for (MCSubRegIterator S(I.getReg(), TRI, true); S.isValid(); ++S) BV.reset(*S); return BV; } unsigned MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); unsigned MaxAlign = getMaxAlignment(); int Offset = 0; // This code is very, very similar to PEI::calculateFrameObjectOffsets(). // It really should be refactored to share code. Until then, changes // should keep in mind that there's tight coupling between the two. for (int i = getObjectIndexBegin(); i != 0; ++i) { int FixedOff = -getObjectOffset(i); if (FixedOff > Offset) Offset = FixedOff; } for (unsigned i = 0, e = getObjectIndexEnd(); i != e; ++i) { if (isDeadObjectIndex(i)) continue; Offset += getObjectSize(i); unsigned Align = getObjectAlignment(i); // Adjust to alignment boundary Offset = (Offset+Align-1)/Align*Align; MaxAlign = std::max(Align, MaxAlign); } if (adjustsStack() && TFI->hasReservedCallFrame(MF)) Offset += getMaxCallFrameSize(); // Round up the size to a multiple of the alignment. If the function has // any calls or alloca's, align to the target's StackAlignment value to // ensure that the callee's frame or the alloca data is suitably aligned; // otherwise, for leaf functions, align to the TransientStackAlignment // value. unsigned StackAlign; if (adjustsStack() || hasVarSizedObjects() || (RegInfo->needsStackRealignment(MF) && getObjectIndexEnd() != 0)) StackAlign = TFI->getStackAlignment(); else StackAlign = TFI->getTransientStackAlignment(); // If the frame pointer is eliminated, all frame offsets will be relative to // SP not FP. Align to MaxAlign so this works. StackAlign = std::max(StackAlign, MaxAlign); unsigned AlignMask = StackAlign - 1; Offset = (Offset + AlignMask) & ~uint64_t(AlignMask); return (unsigned)Offset; } void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{ if (Objects.empty()) return; const TargetFrameLowering *FI = MF.getSubtarget().getFrameLowering(); int ValOffset = (FI ? FI->getOffsetOfLocalArea() : 0); OS << "Frame Objects:\n"; for (unsigned i = 0, e = Objects.size(); i != e; ++i) { const StackObject &SO = Objects[i]; OS << " fi#" << (int)(i-NumFixedObjects) << ": "; if (SO.Size == ~0ULL) { OS << "dead\n"; continue; } if (SO.Size == 0) OS << "variable sized"; else OS << "size=" << SO.Size; OS << ", align=" << SO.Alignment; if (i < NumFixedObjects) OS << ", fixed"; if (i < NumFixedObjects || SO.SPOffset != -1) { int64_t Off = SO.SPOffset - ValOffset; OS << ", at location [SP"; if (Off > 0) OS << "+" << Off; else if (Off < 0) OS << Off; OS << "]"; } OS << "\n"; } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void MachineFrameInfo::dump(const MachineFunction &MF) const { print(MF, dbgs()); } #endif //===----------------------------------------------------------------------===// // MachineJumpTableInfo implementation //===----------------------------------------------------------------------===// /// Return the size of each entry in the jump table. unsigned MachineJumpTableInfo::getEntrySize(const DataLayout &TD) const { // The size of a jump table entry is 4 bytes unless the entry is just the // address of a block, in which case it is the pointer size. switch (getEntryKind()) { case MachineJumpTableInfo::EK_BlockAddress: return TD.getPointerSize(); case MachineJumpTableInfo::EK_GPRel64BlockAddress: return 8; case MachineJumpTableInfo::EK_GPRel32BlockAddress: case MachineJumpTableInfo::EK_LabelDifference32: case MachineJumpTableInfo::EK_Custom32: return 4; case MachineJumpTableInfo::EK_Inline: return 0; } llvm_unreachable("Unknown jump table encoding!"); } /// Return the alignment of each entry in the jump table. unsigned MachineJumpTableInfo::getEntryAlignment(const DataLayout &TD) const { // The alignment of a jump table entry is the alignment of int32 unless the // entry is just the address of a block, in which case it is the pointer // alignment. switch (getEntryKind()) { case MachineJumpTableInfo::EK_BlockAddress: return TD.getPointerABIAlignment(); case MachineJumpTableInfo::EK_GPRel64BlockAddress: return TD.getABIIntegerTypeAlignment(64); case MachineJumpTableInfo::EK_GPRel32BlockAddress: case MachineJumpTableInfo::EK_LabelDifference32: case MachineJumpTableInfo::EK_Custom32: return TD.getABIIntegerTypeAlignment(32); case MachineJumpTableInfo::EK_Inline: return 1; } llvm_unreachable("Unknown jump table encoding!"); } /// Create a new jump table entry in the jump table info. unsigned MachineJumpTableInfo::createJumpTableIndex( const std::vector &DestBBs) { assert(!DestBBs.empty() && "Cannot create an empty jump table!"); JumpTables.push_back(MachineJumpTableEntry(DestBBs)); return JumpTables.size()-1; } /// If Old is the target of any jump tables, update the jump tables to branch /// to New instead. bool MachineJumpTableInfo::ReplaceMBBInJumpTables(MachineBasicBlock *Old, MachineBasicBlock *New) { assert(Old != New && "Not making a change?"); bool MadeChange = false; for (size_t i = 0, e = JumpTables.size(); i != e; ++i) ReplaceMBBInJumpTable(i, Old, New); return MadeChange; } /// If Old is a target of the jump tables, update the jump table to branch to /// New instead. bool MachineJumpTableInfo::ReplaceMBBInJumpTable(unsigned Idx, MachineBasicBlock *Old, MachineBasicBlock *New) { assert(Old != New && "Not making a change?"); bool MadeChange = false; MachineJumpTableEntry &JTE = JumpTables[Idx]; for (size_t j = 0, e = JTE.MBBs.size(); j != e; ++j) if (JTE.MBBs[j] == Old) { JTE.MBBs[j] = New; MadeChange = true; } return MadeChange; } void MachineJumpTableInfo::print(raw_ostream &OS) const { if (JumpTables.empty()) return; OS << "Jump Tables:\n"; for (unsigned i = 0, e = JumpTables.size(); i != e; ++i) { OS << " jt#" << i << ": "; for (unsigned j = 0, f = JumpTables[i].MBBs.size(); j != f; ++j) OS << " BB#" << JumpTables[i].MBBs[j]->getNumber(); } OS << '\n'; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void MachineJumpTableInfo::dump() const { print(dbgs()); } #endif //===----------------------------------------------------------------------===// // MachineConstantPool implementation //===----------------------------------------------------------------------===// void MachineConstantPoolValue::anchor() { } Type *MachineConstantPoolEntry::getType() const { if (isMachineConstantPoolEntry()) return Val.MachineCPVal->getType(); return Val.ConstVal->getType(); } bool MachineConstantPoolEntry::needsRelocation() const { if (isMachineConstantPoolEntry()) return true; return Val.ConstVal->needsRelocation(); } SectionKind MachineConstantPoolEntry::getSectionKind(const DataLayout *DL) const { if (needsRelocation()) return SectionKind::getReadOnlyWithRel(); switch (DL->getTypeAllocSize(getType())) { case 4: return SectionKind::getMergeableConst4(); case 8: return SectionKind::getMergeableConst8(); case 16: return SectionKind::getMergeableConst16(); default: return SectionKind::getReadOnly(); } } MachineConstantPool::~MachineConstantPool() { for (unsigned i = 0, e = Constants.size(); i != e; ++i) if (Constants[i].isMachineConstantPoolEntry()) delete Constants[i].Val.MachineCPVal; for (DenseSet::iterator I = MachineCPVsSharingEntries.begin(), E = MachineCPVsSharingEntries.end(); I != E; ++I) delete *I; } /// Test whether the given two constants can be allocated the same constant pool /// entry. static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B, const DataLayout &DL) { // Handle the trivial case quickly. if (A == B) return true; // If they have the same type but weren't the same constant, quickly // reject them. if (A->getType() == B->getType()) return false; // We can't handle structs or arrays. if (isa(A->getType()) || isa(A->getType()) || isa(B->getType()) || isa(B->getType())) return false; // For now, only support constants with the same size. uint64_t StoreSize = DL.getTypeStoreSize(A->getType()); if (StoreSize != DL.getTypeStoreSize(B->getType()) || StoreSize > 128) return false; Type *IntTy = IntegerType::get(A->getContext(), StoreSize*8); // Try constant folding a bitcast of both instructions to an integer. If we // get two identical ConstantInt's, then we are good to share them. We use // the constant folding APIs to do this so that we get the benefit of // DataLayout. if (isa(A->getType())) A = ConstantFoldInstOperands(Instruction::PtrToInt, IntTy, const_cast(A), DL); else if (A->getType() != IntTy) A = ConstantFoldInstOperands(Instruction::BitCast, IntTy, const_cast(A), DL); if (isa(B->getType())) B = ConstantFoldInstOperands(Instruction::PtrToInt, IntTy, const_cast(B), DL); else if (B->getType() != IntTy) B = ConstantFoldInstOperands(Instruction::BitCast, IntTy, const_cast(B), DL); return A == B; } /// Create a new entry in the constant pool or return an existing one. /// User must specify the log2 of the minimum required alignment for the object. unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C, unsigned Alignment) { assert(Alignment && "Alignment must be specified!"); if (Alignment > PoolAlignment) PoolAlignment = Alignment; // Check to see if we already have this constant. // // FIXME, this could be made much more efficient for large constant pools. for (unsigned i = 0, e = Constants.size(); i != e; ++i) if (!Constants[i].isMachineConstantPoolEntry() && CanShareConstantPoolEntry(Constants[i].Val.ConstVal, C, DL)) { if ((unsigned)Constants[i].getAlignment() < Alignment) Constants[i].Alignment = Alignment; return i; } Constants.push_back(MachineConstantPoolEntry(C, Alignment)); return Constants.size()-1; } unsigned MachineConstantPool::getConstantPoolIndex(MachineConstantPoolValue *V, unsigned Alignment) { assert(Alignment && "Alignment must be specified!"); if (Alignment > PoolAlignment) PoolAlignment = Alignment; // Check to see if we already have this constant. // // FIXME, this could be made much more efficient for large constant pools. int Idx = V->getExistingMachineCPValue(this, Alignment); if (Idx != -1) { MachineCPVsSharingEntries.insert(V); return (unsigned)Idx; } Constants.push_back(MachineConstantPoolEntry(V, Alignment)); return Constants.size()-1; } void MachineConstantPool::print(raw_ostream &OS) const { if (Constants.empty()) return; OS << "Constant Pool:\n"; for (unsigned i = 0, e = Constants.size(); i != e; ++i) { OS << " cp#" << i << ": "; if (Constants[i].isMachineConstantPoolEntry()) Constants[i].Val.MachineCPVal->print(OS); else Constants[i].Val.ConstVal->printAsOperand(OS, /*PrintType=*/false); OS << ", align=" << Constants[i].getAlignment(); OS << "\n"; } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void MachineConstantPool::dump() const { print(dbgs()); } #endif Index: projects/clang380-import/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- projects/clang380-import/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (revision 294609) @@ -1,7370 +1,7370 @@ //===-- SelectionDAG.cpp - Implement the SelectionDAG data structures -----===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This implements the SelectionDAG class. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/SelectionDAG.h" #include "SDNodeDbgValue.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include #include #include using namespace llvm; /// makeVTList - Return an instance of the SDVTList struct initialized with the /// specified members. static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) { SDVTList Res = {VTs, NumVTs}; return Res; } // Default null implementations of the callbacks. void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode*, SDNode*) {} void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {} //===----------------------------------------------------------------------===// // ConstantFPSDNode Class //===----------------------------------------------------------------------===// /// isExactlyValue - We don't rely on operator== working on double values, as /// it returns true for things that are clearly not equal, like -0.0 and 0.0. /// As such, this method can be used to do an exact bit-for-bit comparison of /// two floating point values. bool ConstantFPSDNode::isExactlyValue(const APFloat& V) const { return getValueAPF().bitwiseIsEqual(V); } bool ConstantFPSDNode::isValueValidForType(EVT VT, const APFloat& Val) { assert(VT.isFloatingPoint() && "Can only convert between FP types"); // convert modifies in place, so make a copy. APFloat Val2 = APFloat(Val); bool losesInfo; (void) Val2.convert(SelectionDAG::EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven, &losesInfo); return !losesInfo; } //===----------------------------------------------------------------------===// // ISD Namespace //===----------------------------------------------------------------------===// /// isBuildVectorAllOnes - Return true if the specified node is a /// BUILD_VECTOR where all of the elements are ~0 or undef. bool ISD::isBuildVectorAllOnes(const SDNode *N) { // Look through a bit convert. while (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); if (N->getOpcode() != ISD::BUILD_VECTOR) return false; unsigned i = 0, e = N->getNumOperands(); // Skip over all of the undef values. while (i != e && N->getOperand(i).getOpcode() == ISD::UNDEF) ++i; // Do not accept an all-undef vector. if (i == e) return false; // Do not accept build_vectors that aren't all constants or which have non-~0 // elements. We have to be a bit careful here, as the type of the constant // may not be the same as the type of the vector elements due to type // legalization (the elements are promoted to a legal type for the target and // a vector of a type may be legal when the base element type is not). // We only want to check enough bits to cover the vector elements, because // we care if the resultant vector is all ones, not whether the individual // constants are. SDValue NotZero = N->getOperand(i); unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits(); if (ConstantSDNode *CN = dyn_cast(NotZero)) { if (CN->getAPIntValue().countTrailingOnes() < EltSize) return false; } else if (ConstantFPSDNode *CFPN = dyn_cast(NotZero)) { if (CFPN->getValueAPF().bitcastToAPInt().countTrailingOnes() < EltSize) return false; } else return false; // Okay, we have at least one ~0 value, check to see if the rest match or are // undefs. Even with the above element type twiddling, this should be OK, as // the same type legalization should have applied to all the elements. for (++i; i != e; ++i) if (N->getOperand(i) != NotZero && N->getOperand(i).getOpcode() != ISD::UNDEF) return false; return true; } /// isBuildVectorAllZeros - Return true if the specified node is a /// BUILD_VECTOR where all of the elements are 0 or undef. bool ISD::isBuildVectorAllZeros(const SDNode *N) { // Look through a bit convert. while (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); if (N->getOpcode() != ISD::BUILD_VECTOR) return false; bool IsAllUndef = true; for (const SDValue &Op : N->op_values()) { if (Op.getOpcode() == ISD::UNDEF) continue; IsAllUndef = false; // Do not accept build_vectors that aren't all constants or which have non-0 // elements. We have to be a bit careful here, as the type of the constant // may not be the same as the type of the vector elements due to type // legalization (the elements are promoted to a legal type for the target // and a vector of a type may be legal when the base element type is not). // We only want to check enough bits to cover the vector elements, because // we care if the resultant vector is all zeros, not whether the individual // constants are. unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits(); if (ConstantSDNode *CN = dyn_cast(Op)) { if (CN->getAPIntValue().countTrailingZeros() < EltSize) return false; } else if (ConstantFPSDNode *CFPN = dyn_cast(Op)) { if (CFPN->getValueAPF().bitcastToAPInt().countTrailingZeros() < EltSize) return false; } else return false; } // Do not accept an all-undef vector. if (IsAllUndef) return false; return true; } /// \brief Return true if the specified node is a BUILD_VECTOR node of /// all ConstantSDNode or undef. bool ISD::isBuildVectorOfConstantSDNodes(const SDNode *N) { if (N->getOpcode() != ISD::BUILD_VECTOR) return false; for (const SDValue &Op : N->op_values()) { if (Op.getOpcode() == ISD::UNDEF) continue; if (!isa(Op)) return false; } return true; } /// \brief Return true if the specified node is a BUILD_VECTOR node of /// all ConstantFPSDNode or undef. bool ISD::isBuildVectorOfConstantFPSDNodes(const SDNode *N) { if (N->getOpcode() != ISD::BUILD_VECTOR) return false; for (const SDValue &Op : N->op_values()) { if (Op.getOpcode() == ISD::UNDEF) continue; if (!isa(Op)) return false; } return true; } /// allOperandsUndef - Return true if the node has at least one operand /// and all operands of the specified node are ISD::UNDEF. bool ISD::allOperandsUndef(const SDNode *N) { // Return false if the node has no operands. // This is "logically inconsistent" with the definition of "all" but // is probably the desired behavior. if (N->getNumOperands() == 0) return false; for (const SDValue &Op : N->op_values()) if (Op.getOpcode() != ISD::UNDEF) return false; return true; } ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) { switch (ExtType) { case ISD::EXTLOAD: return IsFP ? ISD::FP_EXTEND : ISD::ANY_EXTEND; case ISD::SEXTLOAD: return ISD::SIGN_EXTEND; case ISD::ZEXTLOAD: return ISD::ZERO_EXTEND; default: break; } llvm_unreachable("Invalid LoadExtType"); } /// getSetCCSwappedOperands - Return the operation corresponding to (Y op X) /// when given the operation for (X op Y). ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) { // To perform this operation, we just need to swap the L and G bits of the // operation. unsigned OldL = (Operation >> 2) & 1; unsigned OldG = (Operation >> 1) & 1; return ISD::CondCode((Operation & ~6) | // Keep the N, U, E bits (OldL << 1) | // New G bit (OldG << 2)); // New L bit. } /// getSetCCInverse - Return the operation corresponding to !(X op Y), where /// 'op' is a valid SetCC operation. ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) { unsigned Operation = Op; if (isInteger) Operation ^= 7; // Flip L, G, E bits, but not U. else Operation ^= 15; // Flip all of the condition bits. if (Operation > ISD::SETTRUE2) Operation &= ~8; // Don't let N and U bits get set. return ISD::CondCode(Operation); } /// isSignedOp - For an integer comparison, return 1 if the comparison is a /// signed operation and 2 if the result is an unsigned comparison. Return zero /// if the operation does not depend on the sign of the input (setne and seteq). static int isSignedOp(ISD::CondCode Opcode) { switch (Opcode) { default: llvm_unreachable("Illegal integer setcc operation!"); case ISD::SETEQ: case ISD::SETNE: return 0; case ISD::SETLT: case ISD::SETLE: case ISD::SETGT: case ISD::SETGE: return 1; case ISD::SETULT: case ISD::SETULE: case ISD::SETUGT: case ISD::SETUGE: return 2; } } /// getSetCCOrOperation - Return the result of a logical OR between different /// comparisons of identical values: ((X op1 Y) | (X op2 Y)). This function /// returns SETCC_INVALID if it is not possible to represent the resultant /// comparison. ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2, bool isInteger) { if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) // Cannot fold a signed integer setcc with an unsigned integer setcc. return ISD::SETCC_INVALID; unsigned Op = Op1 | Op2; // Combine all of the condition bits. // If the N and U bits get set then the resultant comparison DOES suddenly // care about orderedness, and is true when ordered. if (Op > ISD::SETTRUE2) Op &= ~16; // Clear the U bit if the N bit is set. // Canonicalize illegal integer setcc's. if (isInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT Op = ISD::SETNE; return ISD::CondCode(Op); } /// getSetCCAndOperation - Return the result of a logical AND between different /// comparisons of identical values: ((X op1 Y) & (X op2 Y)). This /// function returns zero if it is not possible to represent the resultant /// comparison. ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2, bool isInteger) { if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) // Cannot fold a signed setcc with an unsigned setcc. return ISD::SETCC_INVALID; // Combine all of the condition bits. ISD::CondCode Result = ISD::CondCode(Op1 & Op2); // Canonicalize illegal integer setcc's. if (isInteger) { switch (Result) { default: break; case ISD::SETUO : Result = ISD::SETFALSE; break; // SETUGT & SETULT case ISD::SETOEQ: // SETEQ & SETU[LG]E case ISD::SETUEQ: Result = ISD::SETEQ ; break; // SETUGE & SETULE case ISD::SETOLT: Result = ISD::SETULT ; break; // SETULT & SETNE case ISD::SETOGT: Result = ISD::SETUGT ; break; // SETUGT & SETNE } } return Result; } //===----------------------------------------------------------------------===// // SDNode Profile Support //===----------------------------------------------------------------------===// /// AddNodeIDOpcode - Add the node opcode to the NodeID data. /// static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC) { ID.AddInteger(OpC); } /// AddNodeIDValueTypes - Value type lists are intern'd so we can represent them /// solely with their pointer. static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) { ID.AddPointer(VTList.VTs); } /// AddNodeIDOperands - Various routines for adding operands to the NodeID data. /// static void AddNodeIDOperands(FoldingSetNodeID &ID, ArrayRef Ops) { for (auto& Op : Ops) { ID.AddPointer(Op.getNode()); ID.AddInteger(Op.getResNo()); } } /// AddNodeIDOperands - Various routines for adding operands to the NodeID data. /// static void AddNodeIDOperands(FoldingSetNodeID &ID, ArrayRef Ops) { for (auto& Op : Ops) { ID.AddPointer(Op.getNode()); ID.AddInteger(Op.getResNo()); } } -/// Add logical or fast math flag values to FoldingSetNodeID value. -static void AddNodeIDFlags(FoldingSetNodeID &ID, unsigned Opcode, - const SDNodeFlags *Flags) { - if (!isBinOpWithFlags(Opcode)) - return; - - unsigned RawFlags = 0; - if (Flags) - RawFlags = Flags->getRawFlags(); - ID.AddInteger(RawFlags); -} - -static void AddNodeIDFlags(FoldingSetNodeID &ID, const SDNode *N) { - AddNodeIDFlags(ID, N->getOpcode(), N->getFlags()); -} - static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC, SDVTList VTList, ArrayRef OpList) { AddNodeIDOpcode(ID, OpC); AddNodeIDValueTypes(ID, VTList); AddNodeIDOperands(ID, OpList); } /// If this is an SDNode with special info, add this info to the NodeID data. static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { switch (N->getOpcode()) { case ISD::TargetExternalSymbol: case ISD::ExternalSymbol: case ISD::MCSymbol: llvm_unreachable("Should only be used on nodes with operands"); default: break; // Normal nodes don't need extra info. case ISD::TargetConstant: case ISD::Constant: { const ConstantSDNode *C = cast(N); ID.AddPointer(C->getConstantIntValue()); ID.AddBoolean(C->isOpaque()); break; } case ISD::TargetConstantFP: case ISD::ConstantFP: { ID.AddPointer(cast(N)->getConstantFPValue()); break; } case ISD::TargetGlobalAddress: case ISD::GlobalAddress: case ISD::TargetGlobalTLSAddress: case ISD::GlobalTLSAddress: { const GlobalAddressSDNode *GA = cast(N); ID.AddPointer(GA->getGlobal()); ID.AddInteger(GA->getOffset()); ID.AddInteger(GA->getTargetFlags()); ID.AddInteger(GA->getAddressSpace()); break; } case ISD::BasicBlock: ID.AddPointer(cast(N)->getBasicBlock()); break; case ISD::Register: ID.AddInteger(cast(N)->getReg()); break; case ISD::RegisterMask: ID.AddPointer(cast(N)->getRegMask()); break; case ISD::SRCVALUE: ID.AddPointer(cast(N)->getValue()); break; case ISD::FrameIndex: case ISD::TargetFrameIndex: ID.AddInteger(cast(N)->getIndex()); break; case ISD::JumpTable: case ISD::TargetJumpTable: ID.AddInteger(cast(N)->getIndex()); ID.AddInteger(cast(N)->getTargetFlags()); break; case ISD::ConstantPool: case ISD::TargetConstantPool: { const ConstantPoolSDNode *CP = cast(N); ID.AddInteger(CP->getAlignment()); ID.AddInteger(CP->getOffset()); if (CP->isMachineConstantPoolEntry()) CP->getMachineCPVal()->addSelectionDAGCSEId(ID); else ID.AddPointer(CP->getConstVal()); ID.AddInteger(CP->getTargetFlags()); break; } case ISD::TargetIndex: { const TargetIndexSDNode *TI = cast(N); ID.AddInteger(TI->getIndex()); ID.AddInteger(TI->getOffset()); ID.AddInteger(TI->getTargetFlags()); break; } case ISD::LOAD: { const LoadSDNode *LD = cast(N); ID.AddInteger(LD->getMemoryVT().getRawBits()); ID.AddInteger(LD->getRawSubclassData()); ID.AddInteger(LD->getPointerInfo().getAddrSpace()); break; } case ISD::STORE: { const StoreSDNode *ST = cast(N); ID.AddInteger(ST->getMemoryVT().getRawBits()); ID.AddInteger(ST->getRawSubclassData()); ID.AddInteger(ST->getPointerInfo().getAddrSpace()); break; } case ISD::ATOMIC_CMP_SWAP: case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: { const AtomicSDNode *AT = cast(N); ID.AddInteger(AT->getMemoryVT().getRawBits()); ID.AddInteger(AT->getRawSubclassData()); ID.AddInteger(AT->getPointerInfo().getAddrSpace()); break; } case ISD::PREFETCH: { const MemSDNode *PF = cast(N); ID.AddInteger(PF->getPointerInfo().getAddrSpace()); break; } case ISD::VECTOR_SHUFFLE: { const ShuffleVectorSDNode *SVN = cast(N); for (unsigned i = 0, e = N->getValueType(0).getVectorNumElements(); i != e; ++i) ID.AddInteger(SVN->getMaskElt(i)); break; } case ISD::TargetBlockAddress: case ISD::BlockAddress: { const BlockAddressSDNode *BA = cast(N); ID.AddPointer(BA->getBlockAddress()); ID.AddInteger(BA->getOffset()); ID.AddInteger(BA->getTargetFlags()); break; } } // end switch (N->getOpcode()) - AddNodeIDFlags(ID, N); - // Target specific memory nodes could also have address spaces to check. if (N->isTargetMemoryOpcode()) ID.AddInteger(cast(N)->getPointerInfo().getAddrSpace()); } /// AddNodeIDNode - Generic routine for adding a nodes info to the NodeID /// data. static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) { AddNodeIDOpcode(ID, N->getOpcode()); // Add the return value info. AddNodeIDValueTypes(ID, N->getVTList()); // Add the operand info. AddNodeIDOperands(ID, N->ops()); // Handle SDNode leafs with special info. AddNodeIDCustom(ID, N); } /// encodeMemSDNodeFlags - Generic routine for computing a value for use in /// the CSE map that carries volatility, temporalness, indexing mode, and /// extension/truncation information. /// static inline unsigned encodeMemSDNodeFlags(int ConvType, ISD::MemIndexedMode AM, bool isVolatile, bool isNonTemporal, bool isInvariant) { assert((ConvType & 3) == ConvType && "ConvType may not require more than 2 bits!"); assert((AM & 7) == AM && "AM may not require more than 3 bits!"); return ConvType | (AM << 2) | (isVolatile << 5) | (isNonTemporal << 6) | (isInvariant << 7); } //===----------------------------------------------------------------------===// // SelectionDAG Class //===----------------------------------------------------------------------===// /// doNotCSE - Return true if CSE should not be performed for this node. static bool doNotCSE(SDNode *N) { if (N->getValueType(0) == MVT::Glue) return true; // Never CSE anything that produces a flag. switch (N->getOpcode()) { default: break; case ISD::HANDLENODE: case ISD::EH_LABEL: return true; // Never CSE these nodes. } // Check that remaining values produced are not flags. for (unsigned i = 1, e = N->getNumValues(); i != e; ++i) if (N->getValueType(i) == MVT::Glue) return true; // Never CSE anything that produces a flag. return false; } /// RemoveDeadNodes - This method deletes all unreachable nodes in the /// SelectionDAG. void SelectionDAG::RemoveDeadNodes() { // Create a dummy node (which is not added to allnodes), that adds a reference // to the root node, preventing it from being deleted. HandleSDNode Dummy(getRoot()); SmallVector DeadNodes; // Add all obviously-dead nodes to the DeadNodes worklist. for (SDNode &Node : allnodes()) if (Node.use_empty()) DeadNodes.push_back(&Node); RemoveDeadNodes(DeadNodes); // If the root changed (e.g. it was a dead load, update the root). setRoot(Dummy.getValue()); } /// RemoveDeadNodes - This method deletes the unreachable nodes in the /// given list, and any nodes that become unreachable as a result. void SelectionDAG::RemoveDeadNodes(SmallVectorImpl &DeadNodes) { // Process the worklist, deleting the nodes and adding their uses to the // worklist. while (!DeadNodes.empty()) { SDNode *N = DeadNodes.pop_back_val(); for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) DUL->NodeDeleted(N, nullptr); // Take the node out of the appropriate CSE map. RemoveNodeFromCSEMaps(N); // Next, brutally remove the operand list. This is safe to do, as there are // no cycles in the graph. for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) { SDUse &Use = *I++; SDNode *Operand = Use.getNode(); Use.set(SDValue()); // Now that we removed this operand, see if there are no uses of it left. if (Operand->use_empty()) DeadNodes.push_back(Operand); } DeallocateNode(N); } } void SelectionDAG::RemoveDeadNode(SDNode *N){ SmallVector DeadNodes(1, N); // Create a dummy node that adds a reference to the root node, preventing // it from being deleted. (This matters if the root is an operand of the // dead node.) HandleSDNode Dummy(getRoot()); RemoveDeadNodes(DeadNodes); } void SelectionDAG::DeleteNode(SDNode *N) { // First take this out of the appropriate CSE map. RemoveNodeFromCSEMaps(N); // Finally, remove uses due to operands of this node, remove from the // AllNodes list, and delete the node. DeleteNodeNotInCSEMaps(N); } void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) { assert(N != AllNodes.begin() && "Cannot delete the entry node!"); assert(N->use_empty() && "Cannot delete a node that is not dead!"); // Drop all of the operands and decrement used node's use counts. N->DropOperands(); DeallocateNode(N); } void SDDbgInfo::erase(const SDNode *Node) { DbgValMapType::iterator I = DbgValMap.find(Node); if (I == DbgValMap.end()) return; for (auto &Val: I->second) Val->setIsInvalidated(); DbgValMap.erase(I); } void SelectionDAG::DeallocateNode(SDNode *N) { if (N->OperandsNeedDelete) delete[] N->OperandList; // Set the opcode to DELETED_NODE to help catch bugs when node // memory is reallocated. N->NodeType = ISD::DELETED_NODE; NodeAllocator.Deallocate(AllNodes.remove(N)); // If any of the SDDbgValue nodes refer to this SDNode, invalidate // them and forget about that node. DbgInfo->erase(N); } #ifndef NDEBUG /// VerifySDNode - Sanity check the given SDNode. Aborts if it is invalid. static void VerifySDNode(SDNode *N) { switch (N->getOpcode()) { default: break; case ISD::BUILD_PAIR: { EVT VT = N->getValueType(0); assert(N->getNumValues() == 1 && "Too many results!"); assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) && "Wrong return type!"); assert(N->getNumOperands() == 2 && "Wrong number of operands!"); assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() && "Mismatched operand types!"); assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() && "Wrong operand type!"); assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() && "Wrong return type size"); break; } case ISD::BUILD_VECTOR: { assert(N->getNumValues() == 1 && "Too many results!"); assert(N->getValueType(0).isVector() && "Wrong return type!"); assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() && "Wrong number of operands!"); EVT EltVT = N->getValueType(0).getVectorElementType(); for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) { assert((I->getValueType() == EltVT || (EltVT.isInteger() && I->getValueType().isInteger() && EltVT.bitsLE(I->getValueType()))) && "Wrong operand type!"); assert(I->getValueType() == N->getOperand(0).getValueType() && "Operands must all have the same type"); } break; } } } #endif // NDEBUG /// \brief Insert a newly allocated node into the DAG. /// /// Handles insertion into the all nodes list and CSE map, as well as /// verification and other common operations when a new node is allocated. void SelectionDAG::InsertNode(SDNode *N) { AllNodes.push_back(N); #ifndef NDEBUG N->PersistentId = NextPersistentId++; VerifySDNode(N); #endif } /// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that /// correspond to it. This is useful when we're about to delete or repurpose /// the node. We don't want future request for structurally identical nodes /// to return N anymore. bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) { bool Erased = false; switch (N->getOpcode()) { case ISD::HANDLENODE: return false; // noop. case ISD::CONDCODE: assert(CondCodeNodes[cast(N)->get()] && "Cond code doesn't exist!"); Erased = CondCodeNodes[cast(N)->get()] != nullptr; CondCodeNodes[cast(N)->get()] = nullptr; break; case ISD::ExternalSymbol: Erased = ExternalSymbols.erase(cast(N)->getSymbol()); break; case ISD::TargetExternalSymbol: { ExternalSymbolSDNode *ESN = cast(N); Erased = TargetExternalSymbols.erase( std::pair(ESN->getSymbol(), ESN->getTargetFlags())); break; } case ISD::MCSymbol: { auto *MCSN = cast(N); Erased = MCSymbols.erase(MCSN->getMCSymbol()); break; } case ISD::VALUETYPE: { EVT VT = cast(N)->getVT(); if (VT.isExtended()) { Erased = ExtendedValueTypeNodes.erase(VT); } else { Erased = ValueTypeNodes[VT.getSimpleVT().SimpleTy] != nullptr; ValueTypeNodes[VT.getSimpleVT().SimpleTy] = nullptr; } break; } default: // Remove it from the CSE Map. assert(N->getOpcode() != ISD::DELETED_NODE && "DELETED_NODE in CSEMap!"); assert(N->getOpcode() != ISD::EntryToken && "EntryToken in CSEMap!"); Erased = CSEMap.RemoveNode(N); break; } #ifndef NDEBUG // Verify that the node was actually in one of the CSE maps, unless it has a // flag result (which cannot be CSE'd) or is one of the special cases that are // not subject to CSE. if (!Erased && N->getValueType(N->getNumValues()-1) != MVT::Glue && !N->isMachineOpcode() && !doNotCSE(N)) { N->dump(this); dbgs() << "\n"; llvm_unreachable("Node is not in map!"); } #endif return Erased; } /// AddModifiedNodeToCSEMaps - The specified node has been removed from the CSE /// maps and modified in place. Add it back to the CSE maps, unless an identical /// node already exists, in which case transfer all its users to the existing /// node. This transfer can potentially trigger recursive merging. /// void SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) { // For node types that aren't CSE'd, just act as if no identical node // already exists. if (!doNotCSE(N)) { SDNode *Existing = CSEMap.GetOrInsertNode(N); if (Existing != N) { // If there was already an existing matching node, use ReplaceAllUsesWith // to replace the dead one with the existing one. This can cause // recursive merging of other unrelated nodes down the line. ReplaceAllUsesWith(N, Existing); // N is now dead. Inform the listeners and delete it. for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) DUL->NodeDeleted(N, Existing); DeleteNodeNotInCSEMaps(N); return; } } // If the node doesn't already exist, we updated it. Inform listeners. for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) DUL->NodeUpdated(N); } /// FindModifiedNodeSlot - Find a slot for the specified node if its operands /// were replaced with those specified. If this node is never memoized, /// return null, otherwise return a pointer to the slot it would take. If a /// node already exists with these operands, the slot will be non-null. SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op, void *&InsertPos) { if (doNotCSE(N)) return nullptr; SDValue Ops[] = { Op }; FoldingSetNodeID ID; AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops); AddNodeIDCustom(ID, N); SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos); + if (Node) + if (const SDNodeFlags *Flags = N->getFlags()) + Node->intersectFlagsWith(Flags); return Node; } /// FindModifiedNodeSlot - Find a slot for the specified node if its operands /// were replaced with those specified. If this node is never memoized, /// return null, otherwise return a pointer to the slot it would take. If a /// node already exists with these operands, the slot will be non-null. SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op1, SDValue Op2, void *&InsertPos) { if (doNotCSE(N)) return nullptr; SDValue Ops[] = { Op1, Op2 }; FoldingSetNodeID ID; AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops); AddNodeIDCustom(ID, N); SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos); + if (Node) + if (const SDNodeFlags *Flags = N->getFlags()) + Node->intersectFlagsWith(Flags); return Node; } /// FindModifiedNodeSlot - Find a slot for the specified node if its operands /// were replaced with those specified. If this node is never memoized, /// return null, otherwise return a pointer to the slot it would take. If a /// node already exists with these operands, the slot will be non-null. SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef Ops, void *&InsertPos) { if (doNotCSE(N)) return nullptr; FoldingSetNodeID ID; AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops); AddNodeIDCustom(ID, N); SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos); + if (Node) + if (const SDNodeFlags *Flags = N->getFlags()) + Node->intersectFlagsWith(Flags); return Node; } /// getEVTAlignment - Compute the default alignment value for the /// given type. /// unsigned SelectionDAG::getEVTAlignment(EVT VT) const { Type *Ty = VT == MVT::iPTR ? PointerType::get(Type::getInt8Ty(*getContext()), 0) : VT.getTypeForEVT(*getContext()); return getDataLayout().getABITypeAlignment(Ty); } // EntryNode could meaningfully have debug info if we can find it... SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) : TM(tm), TSI(nullptr), TLI(nullptr), OptLevel(OL), EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)), Root(getEntryNode()), NewNodesMustHaveLegalTypes(false), UpdateListeners(nullptr) { InsertNode(&EntryNode); DbgInfo = new SDDbgInfo(); } void SelectionDAG::init(MachineFunction &mf) { MF = &mf; TLI = getSubtarget().getTargetLowering(); TSI = getSubtarget().getSelectionDAGInfo(); Context = &mf.getFunction()->getContext(); } SelectionDAG::~SelectionDAG() { assert(!UpdateListeners && "Dangling registered DAGUpdateListeners"); allnodes_clear(); delete DbgInfo; } void SelectionDAG::allnodes_clear() { assert(&*AllNodes.begin() == &EntryNode); AllNodes.remove(AllNodes.begin()); while (!AllNodes.empty()) DeallocateNode(&AllNodes.front()); #ifndef NDEBUG NextPersistentId = 0; #endif } BinarySDNode *SelectionDAG::GetBinarySDNode(unsigned Opcode, SDLoc DL, SDVTList VTs, SDValue N1, SDValue N2, const SDNodeFlags *Flags) { if (isBinOpWithFlags(Opcode)) { // If no flags were passed in, use a default flags object. SDNodeFlags F; if (Flags == nullptr) Flags = &F; BinaryWithFlagsSDNode *FN = new (NodeAllocator) BinaryWithFlagsSDNode( Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2, *Flags); return FN; } BinarySDNode *N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2); return N; } SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos) { SDNode *N = CSEMap.FindNodeOrInsertPos(ID, InsertPos); if (N) { switch (N->getOpcode()) { default: break; case ISD::Constant: case ISD::ConstantFP: llvm_unreachable("Querying for Constant and ConstantFP nodes requires " "debug location. Use another overload."); } } return N; } SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID, DebugLoc DL, void *&InsertPos) { SDNode *N = CSEMap.FindNodeOrInsertPos(ID, InsertPos); if (N) { switch (N->getOpcode()) { default: break; // Process only regular (non-target) constant nodes. case ISD::Constant: case ISD::ConstantFP: // Erase debug location from the node if the node is used at several // different places to do not propagate one location to all uses as it // leads to incorrect debug info. if (N->getDebugLoc() != DL) N->setDebugLoc(DebugLoc()); break; } } return N; } void SelectionDAG::clear() { allnodes_clear(); OperandAllocator.Reset(); CSEMap.clear(); ExtendedValueTypeNodes.clear(); ExternalSymbols.clear(); TargetExternalSymbols.clear(); MCSymbols.clear(); std::fill(CondCodeNodes.begin(), CondCodeNodes.end(), static_cast(nullptr)); std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(), static_cast(nullptr)); EntryNode.UseList = nullptr; InsertNode(&EntryNode); Root = getEntryNode(); DbgInfo->clear(); } SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? getNode(ISD::ANY_EXTEND, DL, VT, Op) : getNode(ISD::TRUNCATE, DL, VT, Op); } SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? getNode(ISD::SIGN_EXTEND, DL, VT, Op) : getNode(ISD::TRUNCATE, DL, VT, Op); } SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? getNode(ISD::ZERO_EXTEND, DL, VT, Op) : getNode(ISD::TRUNCATE, DL, VT, Op); } SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT, EVT OpVT) { if (VT.bitsLE(Op.getValueType())) return getNode(ISD::TRUNCATE, SL, VT, Op); TargetLowering::BooleanContent BType = TLI->getBooleanContents(OpVT); return getNode(TLI->getExtendForContent(BType), SL, VT, Op); } SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, SDLoc DL, EVT VT) { assert(!VT.isVector() && "getZeroExtendInReg should use the vector element type instead of " "the vector type!"); if (Op.getValueType() == VT) return Op; unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits(); APInt Imm = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); return getNode(ISD::AND, DL, Op.getValueType(), Op, getConstant(Imm, DL, Op.getValueType())); } SDValue SelectionDAG::getAnyExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) { assert(VT.isVector() && "This DAG node is restricted to vector types."); assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() && "The sizes of the input and result must match in order to perform the " "extend in-register."); assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() && "The destination vector type must have fewer lanes than the input."); return getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, Op); } SDValue SelectionDAG::getSignExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) { assert(VT.isVector() && "This DAG node is restricted to vector types."); assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() && "The sizes of the input and result must match in order to perform the " "extend in-register."); assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() && "The destination vector type must have fewer lanes than the input."); return getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, Op); } SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) { assert(VT.isVector() && "This DAG node is restricted to vector types."); assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() && "The sizes of the input and result must match in order to perform the " "extend in-register."); assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() && "The destination vector type must have fewer lanes than the input."); return getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Op); } /// getNOT - Create a bitwise NOT operation as (XOR Val, -1). /// SDValue SelectionDAG::getNOT(SDLoc DL, SDValue Val, EVT VT) { EVT EltVT = VT.getScalarType(); SDValue NegOne = getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, VT); return getNode(ISD::XOR, DL, VT, Val, NegOne); } SDValue SelectionDAG::getLogicalNOT(SDLoc DL, SDValue Val, EVT VT) { EVT EltVT = VT.getScalarType(); SDValue TrueValue; switch (TLI->getBooleanContents(VT)) { case TargetLowering::ZeroOrOneBooleanContent: case TargetLowering::UndefinedBooleanContent: TrueValue = getConstant(1, DL, VT); break; case TargetLowering::ZeroOrNegativeOneBooleanContent: TrueValue = getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, VT); break; } return getNode(ISD::XOR, DL, VT, Val, TrueValue); } SDValue SelectionDAG::getConstant(uint64_t Val, SDLoc DL, EVT VT, bool isT, bool isO) { EVT EltVT = VT.getScalarType(); assert((EltVT.getSizeInBits() >= 64 || (uint64_t)((int64_t)Val >> EltVT.getSizeInBits()) + 1 < 2) && "getConstant with a uint64_t value that doesn't fit in the type!"); return getConstant(APInt(EltVT.getSizeInBits(), Val), DL, VT, isT, isO); } SDValue SelectionDAG::getConstant(const APInt &Val, SDLoc DL, EVT VT, bool isT, bool isO) { return getConstant(*ConstantInt::get(*Context, Val), DL, VT, isT, isO); } SDValue SelectionDAG::getConstant(const ConstantInt &Val, SDLoc DL, EVT VT, bool isT, bool isO) { assert(VT.isInteger() && "Cannot create FP integer constant!"); EVT EltVT = VT.getScalarType(); const ConstantInt *Elt = &Val; // In some cases the vector type is legal but the element type is illegal and // needs to be promoted, for example v8i8 on ARM. In this case, promote the // inserted value (the type does not need to match the vector element type). // Any extra bits introduced will be truncated away. if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) == TargetLowering::TypePromoteInteger) { EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT); APInt NewVal = Elt->getValue().zext(EltVT.getSizeInBits()); Elt = ConstantInt::get(*getContext(), NewVal); } // In other cases the element type is illegal and needs to be expanded, for // example v2i64 on MIPS32. In this case, find the nearest legal type, split // the value into n parts and use a vector type with n-times the elements. // Then bitcast to the type requested. // Legalizing constants too early makes the DAGCombiner's job harder so we // only legalize if the DAG tells us we must produce legal types. else if (NewNodesMustHaveLegalTypes && VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) == TargetLowering::TypeExpandInteger) { APInt NewVal = Elt->getValue(); EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT); unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits(); unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits; EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts); // Check the temporary vector is the correct size. If this fails then // getTypeToTransformTo() probably returned a type whose size (in bits) // isn't a power-of-2 factor of the requested type size. assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits()); SmallVector EltParts; for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i) { EltParts.push_back(getConstant(NewVal.lshr(i * ViaEltSizeInBits) .trunc(ViaEltSizeInBits), DL, ViaEltVT, isT, isO)); } // EltParts is currently in little endian order. If we actually want // big-endian order then reverse it now. if (getDataLayout().isBigEndian()) std::reverse(EltParts.begin(), EltParts.end()); // The elements must be reversed when the element order is different // to the endianness of the elements (because the BITCAST is itself a // vector shuffle in this situation). However, we do not need any code to // perform this reversal because getConstant() is producing a vector // splat. // This situation occurs in MIPS MSA. SmallVector Ops; for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) Ops.insert(Ops.end(), EltParts.begin(), EltParts.end()); SDValue Result = getNode(ISD::BITCAST, SDLoc(), VT, getNode(ISD::BUILD_VECTOR, SDLoc(), ViaVecVT, Ops)); return Result; } assert(Elt->getBitWidth() == EltVT.getSizeInBits() && "APInt size does not match type size!"); unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(EltVT), None); ID.AddPointer(Elt); ID.AddBoolean(isO); void *IP = nullptr; SDNode *N = nullptr; if ((N = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP))) if (!VT.isVector()) return SDValue(N, 0); if (!N) { N = new (NodeAllocator) ConstantSDNode(isT, isO, Elt, DL.getDebugLoc(), EltVT); CSEMap.InsertNode(N, IP); InsertNode(N); } SDValue Result(N, 0); if (VT.isVector()) { SmallVector Ops; Ops.assign(VT.getVectorNumElements(), Result); Result = getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Ops); } return Result; } SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, SDLoc DL, bool isTarget) { return getConstant(Val, DL, TLI->getPointerTy(getDataLayout()), isTarget); } SDValue SelectionDAG::getConstantFP(const APFloat& V, SDLoc DL, EVT VT, bool isTarget) { return getConstantFP(*ConstantFP::get(*getContext(), V), DL, VT, isTarget); } SDValue SelectionDAG::getConstantFP(const ConstantFP& V, SDLoc DL, EVT VT, bool isTarget){ assert(VT.isFloatingPoint() && "Cannot create integer FP constant!"); EVT EltVT = VT.getScalarType(); // Do the map lookup using the actual bit pattern for the floating point // value, so that we don't have problems with 0.0 comparing equal to -0.0, and // we don't have issues with SNANs. unsigned Opc = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(EltVT), None); ID.AddPointer(&V); void *IP = nullptr; SDNode *N = nullptr; if ((N = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP))) if (!VT.isVector()) return SDValue(N, 0); if (!N) { N = new (NodeAllocator) ConstantFPSDNode(isTarget, &V, DL.getDebugLoc(), EltVT); CSEMap.InsertNode(N, IP); InsertNode(N); } SDValue Result(N, 0); if (VT.isVector()) { SmallVector Ops; Ops.assign(VT.getVectorNumElements(), Result); Result = getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Ops); } return Result; } SDValue SelectionDAG::getConstantFP(double Val, SDLoc DL, EVT VT, bool isTarget) { EVT EltVT = VT.getScalarType(); if (EltVT==MVT::f32) return getConstantFP(APFloat((float)Val), DL, VT, isTarget); else if (EltVT==MVT::f64) return getConstantFP(APFloat(Val), DL, VT, isTarget); else if (EltVT==MVT::f80 || EltVT==MVT::f128 || EltVT==MVT::ppcf128 || EltVT==MVT::f16) { bool ignored; APFloat apf = APFloat(Val); apf.convert(EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven, &ignored); return getConstantFP(apf, DL, VT, isTarget); } else llvm_unreachable("Unsupported type in getConstantFP"); } SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL, EVT VT, int64_t Offset, bool isTargetGA, unsigned char TargetFlags) { assert((TargetFlags == 0 || isTargetGA) && "Cannot set target flags on target-independent globals"); // Truncate (with sign-extension) the offset value to the pointer size. unsigned BitWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType()); if (BitWidth < 64) Offset = SignExtend64(Offset, BitWidth); unsigned Opc; if (GV->isThreadLocal()) Opc = isTargetGA ? ISD::TargetGlobalTLSAddress : ISD::GlobalTLSAddress; else Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddPointer(GV); ID.AddInteger(Offset); ID.AddInteger(TargetFlags); ID.AddInteger(GV->getType()->getAddressSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) GlobalAddressSDNode(Opc, DL.getIROrder(), DL.getDebugLoc(), GV, VT, Offset, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) { unsigned Opc = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(FI); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) FrameIndexSDNode(FI, VT, isTarget); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget, unsigned char TargetFlags) { assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent jump tables"); unsigned Opc = isTarget ? ISD::TargetJumpTable : ISD::JumpTable; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(JTI); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) JumpTableSDNode(JTI, VT, isTarget, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT, unsigned Alignment, int Offset, bool isTarget, unsigned char TargetFlags) { assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent globals"); if (Alignment == 0) Alignment = getDataLayout().getPrefTypeAlignment(C->getType()); unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(Alignment); ID.AddInteger(Offset); ID.AddPointer(C); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset, Alignment, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT, unsigned Alignment, int Offset, bool isTarget, unsigned char TargetFlags) { assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent globals"); if (Alignment == 0) Alignment = getDataLayout().getPrefTypeAlignment(C->getType()); unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(Alignment); ID.AddInteger(Offset); C->addSelectionDAGCSEId(ID); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset, Alignment, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset, unsigned char TargetFlags) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::TargetIndex, getVTList(VT), None); ID.AddInteger(Index); ID.AddInteger(Offset); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) TargetIndexSDNode(Index, VT, Offset, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), None); ID.AddPointer(MBB); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) BasicBlockSDNode(MBB); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getValueType(EVT VT) { if (VT.isSimple() && (unsigned)VT.getSimpleVT().SimpleTy >= ValueTypeNodes.size()) ValueTypeNodes.resize(VT.getSimpleVT().SimpleTy+1); SDNode *&N = VT.isExtended() ? ExtendedValueTypeNodes[VT] : ValueTypeNodes[VT.getSimpleVT().SimpleTy]; if (N) return SDValue(N, 0); N = new (NodeAllocator) VTSDNode(VT); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) { SDNode *&N = ExternalSymbols[Sym]; if (N) return SDValue(N, 0); N = new (NodeAllocator) ExternalSymbolSDNode(false, Sym, 0, VT); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getMCSymbol(MCSymbol *Sym, EVT VT) { SDNode *&N = MCSymbols[Sym]; if (N) return SDValue(N, 0); N = new (NodeAllocator) MCSymbolSDNode(Sym, VT); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT, unsigned char TargetFlags) { SDNode *&N = TargetExternalSymbols[std::pair(Sym, TargetFlags)]; if (N) return SDValue(N, 0); N = new (NodeAllocator) ExternalSymbolSDNode(true, Sym, TargetFlags, VT); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) { if ((unsigned)Cond >= CondCodeNodes.size()) CondCodeNodes.resize(Cond+1); if (!CondCodeNodes[Cond]) { CondCodeSDNode *N = new (NodeAllocator) CondCodeSDNode(Cond); CondCodeNodes[Cond] = N; InsertNode(N); } return SDValue(CondCodeNodes[Cond], 0); } // commuteShuffle - swaps the values of N1 and N2, and swaps all indices in // the shuffle mask M that point at N1 to point at N2, and indices that point // N2 to point at N1. static void commuteShuffle(SDValue &N1, SDValue &N2, SmallVectorImpl &M) { std::swap(N1, N2); ShuffleVectorSDNode::commuteMask(M); } SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1, SDValue N2, const int *Mask) { assert(VT == N1.getValueType() && VT == N2.getValueType() && "Invalid VECTOR_SHUFFLE"); // Canonicalize shuffle undef, undef -> undef if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() == ISD::UNDEF) return getUNDEF(VT); // Validate that all indices in Mask are within the range of the elements // input to the shuffle. unsigned NElts = VT.getVectorNumElements(); SmallVector MaskVec; for (unsigned i = 0; i != NElts; ++i) { assert(Mask[i] < (int)(NElts * 2) && "Index out of range"); MaskVec.push_back(Mask[i]); } // Canonicalize shuffle v, v -> v, undef if (N1 == N2) { N2 = getUNDEF(VT); for (unsigned i = 0; i != NElts; ++i) if (MaskVec[i] >= (int)NElts) MaskVec[i] -= NElts; } // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask. if (N1.getOpcode() == ISD::UNDEF) commuteShuffle(N1, N2, MaskVec); // If shuffling a splat, try to blend the splat instead. We do this here so // that even when this arises during lowering we don't have to re-handle it. auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) { BitVector UndefElements; SDValue Splat = BV->getSplatValue(&UndefElements); if (!Splat) return; for (int i = 0; i < (int)NElts; ++i) { if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + (int)NElts)) continue; // If this input comes from undef, mark it as such. if (UndefElements[MaskVec[i] - Offset]) { MaskVec[i] = -1; continue; } // If we can blend a non-undef lane, use that instead. if (!UndefElements[i]) MaskVec[i] = i + Offset; } }; if (auto *N1BV = dyn_cast(N1)) BlendSplat(N1BV, 0); if (auto *N2BV = dyn_cast(N2)) BlendSplat(N2BV, NElts); // Canonicalize all index into lhs, -> shuffle lhs, undef // Canonicalize all index into rhs, -> shuffle rhs, undef bool AllLHS = true, AllRHS = true; bool N2Undef = N2.getOpcode() == ISD::UNDEF; for (unsigned i = 0; i != NElts; ++i) { if (MaskVec[i] >= (int)NElts) { if (N2Undef) MaskVec[i] = -1; else AllLHS = false; } else if (MaskVec[i] >= 0) { AllRHS = false; } } if (AllLHS && AllRHS) return getUNDEF(VT); if (AllLHS && !N2Undef) N2 = getUNDEF(VT); if (AllRHS) { N1 = getUNDEF(VT); commuteShuffle(N1, N2, MaskVec); } // Reset our undef status after accounting for the mask. N2Undef = N2.getOpcode() == ISD::UNDEF; // Re-check whether both sides ended up undef. if (N1.getOpcode() == ISD::UNDEF && N2Undef) return getUNDEF(VT); // If Identity shuffle return that node. bool Identity = true, AllSame = true; for (unsigned i = 0; i != NElts; ++i) { if (MaskVec[i] >= 0 && MaskVec[i] != (int)i) Identity = false; if (MaskVec[i] != MaskVec[0]) AllSame = false; } if (Identity && NElts) return N1; // Shuffling a constant splat doesn't change the result. if (N2Undef) { SDValue V = N1; // Look through any bitcasts. We check that these don't change the number // (and size) of elements and just changes their types. while (V.getOpcode() == ISD::BITCAST) V = V->getOperand(0); // A splat should always show up as a build vector node. if (auto *BV = dyn_cast(V)) { BitVector UndefElements; SDValue Splat = BV->getSplatValue(&UndefElements); // If this is a splat of an undef, shuffling it is also undef. if (Splat && Splat.getOpcode() == ISD::UNDEF) return getUNDEF(VT); bool SameNumElts = V.getValueType().getVectorNumElements() == VT.getVectorNumElements(); // We only have a splat which can skip shuffles if there is a splatted // value and no undef lanes rearranged by the shuffle. if (Splat && UndefElements.none()) { // Splat of , return , provided that the // number of elements match or the value splatted is a zero constant. if (SameNumElts) return N1; if (auto *C = dyn_cast(Splat)) if (C->isNullValue()) return N1; } // If the shuffle itself creates a splat, build the vector directly. if (AllSame && SameNumElts) { const SDValue &Splatted = BV->getOperand(MaskVec[0]); SmallVector Ops(NElts, Splatted); EVT BuildVT = BV->getValueType(0); SDValue NewBV = getNode(ISD::BUILD_VECTOR, dl, BuildVT, Ops); // We may have jumped through bitcasts, so the type of the // BUILD_VECTOR may not match the type of the shuffle. if (BuildVT != VT) NewBV = getNode(ISD::BITCAST, dl, VT, NewBV); return NewBV; } } } FoldingSetNodeID ID; SDValue Ops[2] = { N1, N2 }; AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops); for (unsigned i = 0; i != NElts; ++i) ID.AddInteger(MaskVec[i]); void* IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) return SDValue(E, 0); // Allocate the mask array for the node out of the BumpPtrAllocator, since // SDNode doesn't have access to it. This memory will be "leaked" when // the node is deallocated, but recovered when the NodeAllocator is released. int *MaskAlloc = OperandAllocator.Allocate(NElts); memcpy(MaskAlloc, &MaskVec[0], NElts * sizeof(int)); ShuffleVectorSDNode *N = new (NodeAllocator) ShuffleVectorSDNode(VT, dl.getIROrder(), dl.getDebugLoc(), N1, N2, MaskAlloc); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) { MVT VT = SV.getSimpleValueType(0); SmallVector MaskVec(SV.getMask().begin(), SV.getMask().end()); ShuffleVectorSDNode::commuteMask(MaskVec); SDValue Op0 = SV.getOperand(0); SDValue Op1 = SV.getOperand(1); return getVectorShuffle(VT, SDLoc(&SV), Op1, Op0, &MaskVec[0]); } SDValue SelectionDAG::getConvertRndSat(EVT VT, SDLoc dl, SDValue Val, SDValue DTy, SDValue STy, SDValue Rnd, SDValue Sat, ISD::CvtCode Code) { // If the src and dest types are the same and the conversion is between // integer types of the same sign or two floats, no conversion is necessary. if (DTy == STy && (Code == ISD::CVT_UU || Code == ISD::CVT_SS || Code == ISD::CVT_FF)) return Val; FoldingSetNodeID ID; SDValue Ops[] = { Val, DTy, STy, Rnd, Sat }; AddNodeIDNode(ID, ISD::CONVERT_RNDSAT, getVTList(VT), Ops); void* IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) return SDValue(E, 0); CvtRndSatSDNode *N = new (NodeAllocator) CvtRndSatSDNode(VT, dl.getIROrder(), dl.getDebugLoc(), Ops, Code); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::Register, getVTList(VT), None); ID.AddInteger(RegNo); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) RegisterSDNode(RegNo, VT); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::RegisterMask, getVTList(MVT::Untyped), None); ID.AddPointer(RegMask); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) RegisterMaskSDNode(RegMask); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getEHLabel(SDLoc dl, SDValue Root, MCSymbol *Label) { FoldingSetNodeID ID; SDValue Ops[] = { Root }; AddNodeIDNode(ID, ISD::EH_LABEL, getVTList(MVT::Other), Ops); ID.AddPointer(Label); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) EHLabelSDNode(dl.getIROrder(), dl.getDebugLoc(), Root, Label); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset, bool isTarget, unsigned char TargetFlags) { unsigned Opc = isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddPointer(BA); ID.AddInteger(Offset); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) BlockAddressSDNode(Opc, VT, BA, Offset, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getSrcValue(const Value *V) { assert((!V || V->getType()->isPointerTy()) && "SrcValue is not a pointer?"); FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), None); ID.AddPointer(V); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) SrcValueSDNode(V); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } /// getMDNode - Return an MDNodeSDNode which holds an MDNode. SDValue SelectionDAG::getMDNode(const MDNode *MD) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MDNODE_SDNODE, getVTList(MVT::Other), None); ID.AddPointer(MD); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) MDNodeSDNode(MD); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getBitcast(EVT VT, SDValue V) { if (VT == V.getValueType()) return V; return getNode(ISD::BITCAST, SDLoc(V), VT, V); } /// getAddrSpaceCast - Return an AddrSpaceCastSDNode. SDValue SelectionDAG::getAddrSpaceCast(SDLoc dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS) { SDValue Ops[] = {Ptr}; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::ADDRSPACECAST, getVTList(VT), Ops); ID.AddInteger(SrcAS); ID.AddInteger(DestAS); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) AddrSpaceCastSDNode(dl.getIROrder(), dl.getDebugLoc(), VT, Ptr, SrcAS, DestAS); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } /// getShiftAmountOperand - Return the specified value casted to /// the target's desired shift amount type. SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) { EVT OpTy = Op.getValueType(); EVT ShTy = TLI->getShiftAmountTy(LHSTy, getDataLayout()); if (OpTy == ShTy || OpTy.isVector()) return Op; return getZExtOrTrunc(Op, SDLoc(Op), ShTy); } SDValue SelectionDAG::expandVAArg(SDNode *Node) { SDLoc dl(Node); const TargetLowering &TLI = getTargetLoweringInfo(); const Value *V = cast(Node->getOperand(2))->getValue(); EVT VT = Node->getValueType(0); SDValue Tmp1 = Node->getOperand(0); SDValue Tmp2 = Node->getOperand(1); unsigned Align = Node->getConstantOperandVal(3); SDValue VAListLoad = getLoad(TLI.getPointerTy(getDataLayout()), dl, Tmp1, Tmp2, MachinePointerInfo(V), false, false, false, 0); SDValue VAList = VAListLoad; if (Align > TLI.getMinStackArgumentAlignment()) { assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2"); VAList = getNode(ISD::ADD, dl, VAList.getValueType(), VAList, getConstant(Align - 1, dl, VAList.getValueType())); VAList = getNode(ISD::AND, dl, VAList.getValueType(), VAList, getConstant(-(int64_t)Align, dl, VAList.getValueType())); } // Increment the pointer, VAList, to the next vaarg Tmp1 = getNode(ISD::ADD, dl, VAList.getValueType(), VAList, getConstant(getDataLayout().getTypeAllocSize( VT.getTypeForEVT(*getContext())), dl, VAList.getValueType())); // Store the incremented VAList to the legalized pointer Tmp1 = getStore(VAListLoad.getValue(1), dl, Tmp1, Tmp2, MachinePointerInfo(V), false, false, 0); // Load the actual argument out of the pointer VAList return getLoad(VT, dl, Tmp1, VAList, MachinePointerInfo(), false, false, false, 0); } SDValue SelectionDAG::expandVACopy(SDNode *Node) { SDLoc dl(Node); const TargetLowering &TLI = getTargetLoweringInfo(); // This defaults to loading a pointer from the input and storing it to the // output, returning the chain. const Value *VD = cast(Node->getOperand(3))->getValue(); const Value *VS = cast(Node->getOperand(4))->getValue(); SDValue Tmp1 = getLoad(TLI.getPointerTy(getDataLayout()), dl, Node->getOperand(0), Node->getOperand(2), MachinePointerInfo(VS), false, false, false, 0); return getStore(Tmp1.getValue(1), dl, Tmp1, Node->getOperand(1), MachinePointerInfo(VD), false, false, 0); } /// CreateStackTemporary - Create a stack temporary, suitable for holding the /// specified value type. SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) { MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo(); unsigned ByteSize = VT.getStoreSize(); Type *Ty = VT.getTypeForEVT(*getContext()); unsigned StackAlign = std::max((unsigned)getDataLayout().getPrefTypeAlignment(Ty), minAlign); int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout())); } /// CreateStackTemporary - Create a stack temporary suitable for holding /// either of the specified value types. SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) { unsigned Bytes = std::max(VT1.getStoreSize(), VT2.getStoreSize()); Type *Ty1 = VT1.getTypeForEVT(*getContext()); Type *Ty2 = VT2.getTypeForEVT(*getContext()); const DataLayout &DL = getDataLayout(); unsigned Align = std::max(DL.getPrefTypeAlignment(Ty1), DL.getPrefTypeAlignment(Ty2)); MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(Bytes, Align, false); return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout())); } SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2, ISD::CondCode Cond, SDLoc dl) { // These setcc operations always fold. switch (Cond) { default: break; case ISD::SETFALSE: case ISD::SETFALSE2: return getConstant(0, dl, VT); case ISD::SETTRUE: case ISD::SETTRUE2: { TargetLowering::BooleanContent Cnt = TLI->getBooleanContents(N1->getValueType(0)); return getConstant( Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, dl, VT); } case ISD::SETOEQ: case ISD::SETOGT: case ISD::SETOGE: case ISD::SETOLT: case ISD::SETOLE: case ISD::SETONE: case ISD::SETO: case ISD::SETUO: case ISD::SETUEQ: case ISD::SETUNE: assert(!N1.getValueType().isInteger() && "Illegal setcc for integer!"); break; } if (ConstantSDNode *N2C = dyn_cast(N2)) { const APInt &C2 = N2C->getAPIntValue(); if (ConstantSDNode *N1C = dyn_cast(N1)) { const APInt &C1 = N1C->getAPIntValue(); switch (Cond) { default: llvm_unreachable("Unknown integer setcc!"); case ISD::SETEQ: return getConstant(C1 == C2, dl, VT); case ISD::SETNE: return getConstant(C1 != C2, dl, VT); case ISD::SETULT: return getConstant(C1.ult(C2), dl, VT); case ISD::SETUGT: return getConstant(C1.ugt(C2), dl, VT); case ISD::SETULE: return getConstant(C1.ule(C2), dl, VT); case ISD::SETUGE: return getConstant(C1.uge(C2), dl, VT); case ISD::SETLT: return getConstant(C1.slt(C2), dl, VT); case ISD::SETGT: return getConstant(C1.sgt(C2), dl, VT); case ISD::SETLE: return getConstant(C1.sle(C2), dl, VT); case ISD::SETGE: return getConstant(C1.sge(C2), dl, VT); } } } if (ConstantFPSDNode *N1C = dyn_cast(N1)) { if (ConstantFPSDNode *N2C = dyn_cast(N2)) { APFloat::cmpResult R = N1C->getValueAPF().compare(N2C->getValueAPF()); switch (Cond) { default: break; case ISD::SETEQ: if (R==APFloat::cmpUnordered) return getUNDEF(VT); // fall through case ISD::SETOEQ: return getConstant(R==APFloat::cmpEqual, dl, VT); case ISD::SETNE: if (R==APFloat::cmpUnordered) return getUNDEF(VT); // fall through case ISD::SETONE: return getConstant(R==APFloat::cmpGreaterThan || R==APFloat::cmpLessThan, dl, VT); case ISD::SETLT: if (R==APFloat::cmpUnordered) return getUNDEF(VT); // fall through case ISD::SETOLT: return getConstant(R==APFloat::cmpLessThan, dl, VT); case ISD::SETGT: if (R==APFloat::cmpUnordered) return getUNDEF(VT); // fall through case ISD::SETOGT: return getConstant(R==APFloat::cmpGreaterThan, dl, VT); case ISD::SETLE: if (R==APFloat::cmpUnordered) return getUNDEF(VT); // fall through case ISD::SETOLE: return getConstant(R==APFloat::cmpLessThan || R==APFloat::cmpEqual, dl, VT); case ISD::SETGE: if (R==APFloat::cmpUnordered) return getUNDEF(VT); // fall through case ISD::SETOGE: return getConstant(R==APFloat::cmpGreaterThan || R==APFloat::cmpEqual, dl, VT); case ISD::SETO: return getConstant(R!=APFloat::cmpUnordered, dl, VT); case ISD::SETUO: return getConstant(R==APFloat::cmpUnordered, dl, VT); case ISD::SETUEQ: return getConstant(R==APFloat::cmpUnordered || R==APFloat::cmpEqual, dl, VT); case ISD::SETUNE: return getConstant(R!=APFloat::cmpEqual, dl, VT); case ISD::SETULT: return getConstant(R==APFloat::cmpUnordered || R==APFloat::cmpLessThan, dl, VT); case ISD::SETUGT: return getConstant(R==APFloat::cmpGreaterThan || R==APFloat::cmpUnordered, dl, VT); case ISD::SETULE: return getConstant(R!=APFloat::cmpGreaterThan, dl, VT); case ISD::SETUGE: return getConstant(R!=APFloat::cmpLessThan, dl, VT); } } else { // Ensure that the constant occurs on the RHS. ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond); MVT CompVT = N1.getValueType().getSimpleVT(); if (!TLI->isCondCodeLegal(SwappedCond, CompVT)) return SDValue(); return getSetCC(dl, VT, N2, N1, SwappedCond); } } // Could not fold it. return SDValue(); } /// SignBitIsZero - Return true if the sign bit of Op is known to be zero. We /// use this predicate to simplify operations downstream. bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const { // This predicate is not safe for vector operations. if (Op.getValueType().isVector()) return false; unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits(); return MaskedValueIsZero(Op, APInt::getSignBit(BitWidth), Depth); } /// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use /// this predicate to simplify operations downstream. Mask is known to be zero /// for bits that V cannot have. bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth) const { APInt KnownZero, KnownOne; computeKnownBits(Op, KnownZero, KnownOne, Depth); return (KnownZero & Mask) == Mask; } /// Determine which bits of Op are known to be either zero or one and return /// them in the KnownZero/KnownOne bitsets. void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, APInt &KnownOne, unsigned Depth) const { unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits(); KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. if (Depth == 6) return; // Limit search depth. APInt KnownZero2, KnownOne2; switch (Op.getOpcode()) { case ISD::Constant: // We know all of the bits for a constant! KnownOne = cast(Op)->getAPIntValue(); KnownZero = ~KnownOne; break; case ISD::AND: // If either the LHS or the RHS are Zero, the result is zero. computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); // Output known-1 bits are only known if set in both the LHS & RHS. KnownOne &= KnownOne2; // Output known-0 are known to be clear if zero in either the LHS | RHS. KnownZero |= KnownZero2; break; case ISD::OR: computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); // Output known-0 bits are only known if clear in both the LHS & RHS. KnownZero &= KnownZero2; // Output known-1 are known to be set if set in either the LHS | RHS. KnownOne |= KnownOne2; break; case ISD::XOR: { computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); // Output known-0 bits are known if clear or set in both the LHS & RHS. APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2); // Output known-1 are known to be set if set in only one of the LHS, RHS. KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2); KnownZero = KnownZeroOut; break; } case ISD::MUL: { computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); // If low bits are zero in either operand, output low known-0 bits. // Also compute a conserative estimate for high known-0 bits. // More trickiness is possible, but this is sufficient for the // interesting case of alignment computation. KnownOne.clearAllBits(); unsigned TrailZ = KnownZero.countTrailingOnes() + KnownZero2.countTrailingOnes(); unsigned LeadZ = std::max(KnownZero.countLeadingOnes() + KnownZero2.countLeadingOnes(), BitWidth) - BitWidth; TrailZ = std::min(TrailZ, BitWidth); LeadZ = std::min(LeadZ, BitWidth); KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) | APInt::getHighBitsSet(BitWidth, LeadZ); break; } case ISD::UDIV: { // For the purposes of computing leading zeros we can conservatively // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); unsigned LeadZ = KnownZero2.countLeadingOnes(); KnownOne2.clearAllBits(); KnownZero2.clearAllBits(); computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros(); if (RHSUnknownLeadingOnes != BitWidth) LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ); break; } case ISD::SELECT: computeKnownBits(Op.getOperand(2), KnownZero, KnownOne, Depth+1); computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); // Only known if known in both the LHS and RHS. KnownOne &= KnownOne2; KnownZero &= KnownZero2; break; case ISD::SELECT_CC: computeKnownBits(Op.getOperand(3), KnownZero, KnownOne, Depth+1); computeKnownBits(Op.getOperand(2), KnownZero2, KnownOne2, Depth+1); // Only known if known in both the LHS and RHS. KnownOne &= KnownOne2; KnownZero &= KnownZero2; break; case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: if (Op.getResNo() != 1) break; // The boolean result conforms to getBooleanContents. // If we know the result of a setcc has the top bits zero, use this info. // We know that we have an integer-based boolean since these operations // are only available for integer. if (TLI->getBooleanContents(Op.getValueType().isVector(), false) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ISD::SETCC: // If we know the result of a setcc has the top bits zero, use this info. if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ISD::SHL: // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0 if (ConstantSDNode *SA = dyn_cast(Op.getOperand(1))) { unsigned ShAmt = SA->getZExtValue(); // If the shift count is an invalid immediate, don't do anything. if (ShAmt >= BitWidth) break; computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero <<= ShAmt; KnownOne <<= ShAmt; // low bits known zero. KnownZero |= APInt::getLowBitsSet(BitWidth, ShAmt); } break; case ISD::SRL: // (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0 if (ConstantSDNode *SA = dyn_cast(Op.getOperand(1))) { unsigned ShAmt = SA->getZExtValue(); // If the shift count is an invalid immediate, don't do anything. if (ShAmt >= BitWidth) break; computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.lshr(ShAmt); KnownOne = KnownOne.lshr(ShAmt); APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt); KnownZero |= HighBits; // High bits known zero. } break; case ISD::SRA: if (ConstantSDNode *SA = dyn_cast(Op.getOperand(1))) { unsigned ShAmt = SA->getZExtValue(); // If the shift count is an invalid immediate, don't do anything. if (ShAmt >= BitWidth) break; // If any of the demanded bits are produced by the sign extension, we also // demand the input sign bit. APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt); computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.lshr(ShAmt); KnownOne = KnownOne.lshr(ShAmt); // Handle the sign bits. APInt SignBit = APInt::getSignBit(BitWidth); SignBit = SignBit.lshr(ShAmt); // Adjust to where it is now in the mask. if (KnownZero.intersects(SignBit)) { KnownZero |= HighBits; // New bits are known zero. } else if (KnownOne.intersects(SignBit)) { KnownOne |= HighBits; // New bits are known one. } } break; case ISD::SIGN_EXTEND_INREG: { EVT EVT = cast(Op.getOperand(1))->getVT(); unsigned EBits = EVT.getScalarType().getSizeInBits(); // Sign extension. Compute the demanded bits in the result that are not // present in the input. APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits); APInt InSignBit = APInt::getSignBit(EBits); APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, EBits); // If the sign extended bits are demanded, we know that the sign // bit is demanded. InSignBit = InSignBit.zext(BitWidth); if (NewBits.getBoolValue()) InputDemandedBits |= InSignBit; computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownOne &= InputDemandedBits; KnownZero &= InputDemandedBits; // If the sign bit of the input is known set or clear, then we know the // top bits of the result. if (KnownZero.intersects(InSignBit)) { // Input sign bit known clear KnownZero |= NewBits; KnownOne &= ~NewBits; } else if (KnownOne.intersects(InSignBit)) { // Input sign bit known set KnownOne |= NewBits; KnownZero &= ~NewBits; } else { // Input sign bit unknown KnownZero &= ~NewBits; KnownOne &= ~NewBits; } break; } case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: case ISD::CTPOP: { unsigned LowBits = Log2_32(BitWidth)+1; KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); KnownOne.clearAllBits(); break; } case ISD::LOAD: { LoadSDNode *LD = cast(Op); // If this is a ZEXTLoad and we are looking at the loaded value. if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) { EVT VT = LD->getMemoryVT(); unsigned MemBits = VT.getScalarType().getSizeInBits(); KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); } else if (const MDNode *Ranges = LD->getRanges()) { if (LD->getExtensionType() == ISD::NON_EXTLOAD) computeKnownBitsFromRangeMetadata(*Ranges, KnownZero, KnownOne); } break; } case ISD::ZERO_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarType().getSizeInBits(); APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits); KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); KnownZero |= NewBits; break; } case ISD::SIGN_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarType().getSizeInBits(); APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits); KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); // Note if the sign bit is known to be zero or one. bool SignBitKnownZero = KnownZero.isNegative(); bool SignBitKnownOne = KnownOne.isNegative(); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); // If the sign bit is known zero or one, the top bits match. if (SignBitKnownZero) KnownZero |= NewBits; else if (SignBitKnownOne) KnownOne |= NewBits; break; } case ISD::ANY_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarType().getSizeInBits(); KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); break; } case ISD::TRUNCATE: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarType().getSizeInBits(); KnownZero = KnownZero.zext(InBits); KnownOne = KnownOne.zext(InBits); computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.trunc(BitWidth); KnownOne = KnownOne.trunc(BitWidth); break; } case ISD::AssertZext: { EVT VT = cast(Op.getOperand(1))->getVT(); APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero |= (~InMask); KnownOne &= (~KnownZero); break; } case ISD::FGETSIGN: // All bits are zero except the low bit. KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ISD::SUB: { if (ConstantSDNode *CLHS = dyn_cast(Op.getOperand(0))) { // We know that the top bits of C-X are clear if X contains less bits // than C (i.e. no wrap-around can happen). For example, 20-X is // positive if we can prove that X is >= 0 and < 16. if (CLHS->getAPIntValue().isNonNegative()) { unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros(); // NLZ can't be BitWidth with no sign bit APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1); computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); // If all of the MaskV bits are known to be zero, then we know the // output top bits are zero, because we now know that the output is // from [0-C]. if ((KnownZero2 & MaskV) == MaskV) { unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros(); // Top bits known zero. KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2); } } } } // fall through case ISD::ADD: case ISD::ADDE: { // Output known-0 bits are known if clear or set in both the low clear bits // common to both LHS & RHS. For example, 8+(X<<3) is known to have the // low 3 bits clear. // Output known-0 bits are also known if the top bits of each input are // known to be clear. For example, if one input has the top 10 bits clear // and the other has the top 8 bits clear, we know the top 7 bits of the // output must be clear. computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); unsigned KnownZeroHigh = KnownZero2.countLeadingOnes(); unsigned KnownZeroLow = KnownZero2.countTrailingOnes(); computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); KnownZeroHigh = std::min(KnownZeroHigh, KnownZero2.countLeadingOnes()); KnownZeroLow = std::min(KnownZeroLow, KnownZero2.countTrailingOnes()); if (Op.getOpcode() == ISD::ADD) { KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroLow); if (KnownZeroHigh > 1) KnownZero |= APInt::getHighBitsSet(BitWidth, KnownZeroHigh - 1); break; } // With ADDE, a carry bit may be added in, so we can only use this // information if we know (at least) that the low two bits are clear. We // then return to the caller that the low bit is unknown but that other bits // are known zero. if (KnownZeroLow >= 2) // ADDE KnownZero |= APInt::getBitsSet(BitWidth, 1, KnownZeroLow); break; } case ISD::SREM: if (ConstantSDNode *Rem = dyn_cast(Op.getOperand(1))) { const APInt &RA = Rem->getAPIntValue().abs(); if (RA.isPowerOf2()) { APInt LowBits = RA - 1; computeKnownBits(Op.getOperand(0), KnownZero2,KnownOne2,Depth+1); // The low bits of the first operand are unchanged by the srem. KnownZero = KnownZero2 & LowBits; KnownOne = KnownOne2 & LowBits; // If the first operand is non-negative or has all low bits zero, then // the upper bits are all zero. if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits)) KnownZero |= ~LowBits; // If the first operand is negative and not all low bits are zero, then // the upper bits are all one. if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0)) KnownOne |= ~LowBits; assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?"); } } break; case ISD::UREM: { if (ConstantSDNode *Rem = dyn_cast(Op.getOperand(1))) { const APInt &RA = Rem->getAPIntValue(); if (RA.isPowerOf2()) { APInt LowBits = (RA - 1); computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth + 1); // The upper bits are all zero, the lower ones are unchanged. KnownZero = KnownZero2 | ~LowBits; KnownOne = KnownOne2 & LowBits; break; } } // Since the result is less than or equal to either operand, any leading // zero bits in either operand must also exist in the result. computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); uint32_t Leaders = std::max(KnownZero.countLeadingOnes(), KnownZero2.countLeadingOnes()); KnownOne.clearAllBits(); KnownZero = APInt::getHighBitsSet(BitWidth, Leaders); break; } case ISD::EXTRACT_ELEMENT: { computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); const unsigned Index = cast(Op.getOperand(1))->getZExtValue(); const unsigned BitWidth = Op.getValueType().getSizeInBits(); // Remove low part of known bits mask KnownZero = KnownZero.getHiBits(KnownZero.getBitWidth() - Index * BitWidth); KnownOne = KnownOne.getHiBits(KnownOne.getBitWidth() - Index * BitWidth); // Remove high part of known bit mask KnownZero = KnownZero.trunc(BitWidth); KnownOne = KnownOne.trunc(BitWidth); break; } case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: { APInt Op0Zero, Op0One; APInt Op1Zero, Op1One; computeKnownBits(Op.getOperand(0), Op0Zero, Op0One, Depth); computeKnownBits(Op.getOperand(1), Op1Zero, Op1One, Depth); KnownZero = Op0Zero & Op1Zero; KnownOne = Op0One & Op1One; break; } case ISD::FrameIndex: case ISD::TargetFrameIndex: if (unsigned Align = InferPtrAlignment(Op)) { // The low bits are known zero if the pointer is aligned. KnownZero = APInt::getLowBitsSet(BitWidth, Log2_32(Align)); break; } break; default: if (Op.getOpcode() < ISD::BUILTIN_OP_END) break; // Fallthrough case ISD::INTRINSIC_WO_CHAIN: case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_VOID: // Allow the target to implement this method for its nodes. TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth); break; } assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); } /// ComputeNumSignBits - Return the number of times the sign bit of the /// register is replicated into the other bits. We know that at least 1 bit /// is always equal to the sign bit (itself), but other cases can give us /// information. For example, immediately after an "SRA X, 2", we know that /// the top 3 bits are all equal to each other, so we return 3. unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ EVT VT = Op.getValueType(); assert(VT.isInteger() && "Invalid VT!"); unsigned VTBits = VT.getScalarType().getSizeInBits(); unsigned Tmp, Tmp2; unsigned FirstAnswer = 1; if (Depth == 6) return 1; // Limit search depth. switch (Op.getOpcode()) { default: break; case ISD::AssertSext: Tmp = cast(Op.getOperand(1))->getVT().getSizeInBits(); return VTBits-Tmp+1; case ISD::AssertZext: Tmp = cast(Op.getOperand(1))->getVT().getSizeInBits(); return VTBits-Tmp; case ISD::Constant: { const APInt &Val = cast(Op)->getAPIntValue(); return Val.getNumSignBits(); } case ISD::SIGN_EXTEND: Tmp = VTBits-Op.getOperand(0).getValueType().getScalarType().getSizeInBits(); return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp; case ISD::SIGN_EXTEND_INREG: // Max of the input and what this extends. Tmp = cast(Op.getOperand(1))->getVT().getScalarType().getSizeInBits(); Tmp = VTBits-Tmp+1; Tmp2 = ComputeNumSignBits(Op.getOperand(0), Depth+1); return std::max(Tmp, Tmp2); case ISD::SRA: Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); // SRA X, C -> adds C sign bits. if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { Tmp += C->getZExtValue(); if (Tmp > VTBits) Tmp = VTBits; } return Tmp; case ISD::SHL: if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { // shl destroys sign bits. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); if (C->getZExtValue() >= VTBits || // Bad shift. C->getZExtValue() >= Tmp) break; // Shifted all sign bits out. return Tmp - C->getZExtValue(); } break; case ISD::AND: case ISD::OR: case ISD::XOR: // NOT is handled here. // Logical binary ops preserve the number of sign bits at the worst. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp != 1) { Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1); FirstAnswer = std::min(Tmp, Tmp2); // We computed what we know about the sign bits as our first // answer. Now proceed to the generic code that uses // computeKnownBits, and pick whichever answer is better. } break; case ISD::SELECT: Tmp = ComputeNumSignBits(Op.getOperand(1), Depth+1); if (Tmp == 1) return 1; // Early out. Tmp2 = ComputeNumSignBits(Op.getOperand(2), Depth+1); return std::min(Tmp, Tmp2); case ISD::SELECT_CC: Tmp = ComputeNumSignBits(Op.getOperand(2), Depth+1); if (Tmp == 1) return 1; // Early out. Tmp2 = ComputeNumSignBits(Op.getOperand(3), Depth+1); return std::min(Tmp, Tmp2); case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1); if (Tmp == 1) return 1; // Early out. Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1); return std::min(Tmp, Tmp2); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: if (Op.getResNo() != 1) break; // The boolean result conforms to getBooleanContents. Fall through. // If setcc returns 0/-1, all bits are sign bits. // We know that we have an integer-based boolean since these operations // are only available for integer. if (TLI->getBooleanContents(Op.getValueType().isVector(), false) == TargetLowering::ZeroOrNegativeOneBooleanContent) return VTBits; break; case ISD::SETCC: // If setcc returns 0/-1, all bits are sign bits. if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == TargetLowering::ZeroOrNegativeOneBooleanContent) return VTBits; break; case ISD::ROTL: case ISD::ROTR: if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { unsigned RotAmt = C->getZExtValue() & (VTBits-1); // Handle rotate right by N like a rotate left by 32-N. if (Op.getOpcode() == ISD::ROTR) RotAmt = (VTBits-RotAmt) & (VTBits-1); // If we aren't rotating out all of the known-in sign bits, return the // number that are left. This handles rotl(sext(x), 1) for example. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp > RotAmt+1) return Tmp-RotAmt; } break; case ISD::ADD: // Add can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp == 1) return 1; // Early out. // Special case decrementing a value (ADD X, -1): if (ConstantSDNode *CRHS = dyn_cast(Op.getOperand(1))) if (CRHS->isAllOnesValue()) { APInt KnownZero, KnownOne; computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue()) return VTBits; // If we are subtracting one from a positive number, there is no carry // out of the result. if (KnownZero.isNegative()) return Tmp; } Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1); if (Tmp2 == 1) return 1; return std::min(Tmp, Tmp2)-1; case ISD::SUB: Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1); if (Tmp2 == 1) return 1; // Handle NEG. if (ConstantSDNode *CLHS = dyn_cast(Op.getOperand(0))) if (CLHS->isNullValue()) { APInt KnownZero, KnownOne; computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue()) return VTBits; // If the input is known to be positive (the sign bit is known clear), // the output of the NEG has the same number of sign bits as the input. if (KnownZero.isNegative()) return Tmp2; // Otherwise, we treat this like a SUB. } // Sub can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp == 1) return 1; // Early out. return std::min(Tmp, Tmp2)-1; case ISD::TRUNCATE: // FIXME: it's tricky to do anything useful for this, but it is an important // case for targets like X86. break; case ISD::EXTRACT_ELEMENT: { const int KnownSign = ComputeNumSignBits(Op.getOperand(0), Depth+1); const int BitWidth = Op.getValueType().getSizeInBits(); const int Items = Op.getOperand(0).getValueType().getSizeInBits() / BitWidth; // Get reverse index (starting from 1), Op1 value indexes elements from // little end. Sign starts at big end. const int rIndex = Items - 1 - cast(Op.getOperand(1))->getZExtValue(); // If the sign portion ends in our element the subtraction gives correct // result. Otherwise it gives either negative or > bitwidth result return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0); } } // If we are looking at the loaded value of the SDNode. if (Op.getResNo() == 0) { // Handle LOADX separately here. EXTLOAD case will fallthrough. if (LoadSDNode *LD = dyn_cast(Op)) { unsigned ExtType = LD->getExtensionType(); switch (ExtType) { default: break; case ISD::SEXTLOAD: // '17' bits known Tmp = LD->getMemoryVT().getScalarType().getSizeInBits(); return VTBits-Tmp+1; case ISD::ZEXTLOAD: // '16' bits known Tmp = LD->getMemoryVT().getScalarType().getSizeInBits(); return VTBits-Tmp; } } } // Allow the target to implement this method for its nodes. if (Op.getOpcode() >= ISD::BUILTIN_OP_END || Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN || Op.getOpcode() == ISD::INTRINSIC_W_CHAIN || Op.getOpcode() == ISD::INTRINSIC_VOID) { unsigned NumBits = TLI->ComputeNumSignBitsForTargetNode(Op, *this, Depth); if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits); } // Finally, if we can prove that the top bits of the result are 0's or 1's, // use this information. APInt KnownZero, KnownOne; computeKnownBits(Op, KnownZero, KnownOne, Depth); APInt Mask; if (KnownZero.isNegative()) { // sign bit is 0 Mask = KnownZero; } else if (KnownOne.isNegative()) { // sign bit is 1; Mask = KnownOne; } else { // Nothing known. return FirstAnswer; } // Okay, we know that the sign bit in Mask is set. Use CLZ to determine // the number of identical bits in the top of the input value. Mask = ~Mask; Mask <<= Mask.getBitWidth()-VTBits; // Return # leading zeros. We use 'min' here in case Val was zero before // shifting. We don't want to return '64' as for an i32 "0". return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros())); } /// isBaseWithConstantOffset - Return true if the specified operand is an /// ISD::ADD with a ConstantSDNode on the right-hand side, or if it is an /// ISD::OR with a ConstantSDNode that is guaranteed to have the same /// semantics as an ADD. This handles the equivalence: /// X|Cst == X+Cst iff X&Cst = 0. bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const { if ((Op.getOpcode() != ISD::ADD && Op.getOpcode() != ISD::OR) || !isa(Op.getOperand(1))) return false; if (Op.getOpcode() == ISD::OR && !MaskedValueIsZero(Op.getOperand(0), cast(Op.getOperand(1))->getAPIntValue())) return false; return true; } bool SelectionDAG::isKnownNeverNaN(SDValue Op) const { // If we're told that NaNs won't happen, assume they won't. if (getTarget().Options.NoNaNsFPMath) return true; // If the value is a constant, we can obviously see if it is a NaN or not. if (const ConstantFPSDNode *C = dyn_cast(Op)) return !C->getValueAPF().isNaN(); // TODO: Recognize more cases here. return false; } bool SelectionDAG::isKnownNeverZero(SDValue Op) const { // If the value is a constant, we can obviously see if it is a zero or not. if (const ConstantFPSDNode *C = dyn_cast(Op)) return !C->isZero(); // TODO: Recognize more cases here. switch (Op.getOpcode()) { default: break; case ISD::OR: if (const ConstantSDNode *C = dyn_cast(Op.getOperand(1))) return !C->isNullValue(); break; } return false; } bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { // Check the obvious case. if (A == B) return true; // For for negative and positive zero. if (const ConstantFPSDNode *CA = dyn_cast(A)) if (const ConstantFPSDNode *CB = dyn_cast(B)) if (CA->isZero() && CB->isZero()) return true; // Otherwise they may not be equal. return false; } bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const { assert(A.getValueType() == B.getValueType() && "Values must have the same type"); APInt AZero, AOne; APInt BZero, BOne; computeKnownBits(A, AZero, AOne); computeKnownBits(B, BZero, BOne); return (AZero | BZero).isAllOnesValue(); } static SDValue FoldCONCAT_VECTORS(SDLoc DL, EVT VT, ArrayRef Ops, llvm::SelectionDAG &DAG) { if (Ops.size() == 1) return Ops[0]; // Concat of UNDEFs is UNDEF. if (std::all_of(Ops.begin(), Ops.end(), [](SDValue Op) { return Op.isUndef(); })) return DAG.getUNDEF(VT); // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified // to one big BUILD_VECTOR. // FIXME: Add support for UNDEF and SCALAR_TO_VECTOR as well. if (!std::all_of(Ops.begin(), Ops.end(), [](SDValue Op) { return Op.getOpcode() == ISD::BUILD_VECTOR; })) return SDValue(); EVT SVT = VT.getScalarType(); SmallVector Elts; for (SDValue Op : Ops) Elts.append(Op->op_begin(), Op->op_end()); // BUILD_VECTOR requires all inputs to be of the same type, find the // maximum type and extend them all. for (SDValue Op : Elts) SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT); if (SVT.bitsGT(VT.getScalarType())) for (SDValue &Op : Elts) Op = DAG.getTargetLoweringInfo().isZExtFree(Op.getValueType(), SVT) ? DAG.getZExtOrTrunc(Op, DL, SVT) : DAG.getSExtOrTrunc(Op, DL, SVT); return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Elts); } /// getNode - Gets or creates the specified node. /// SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, getVTList(VT), None); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), getVTList(VT)); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue Operand) { // Constant fold unary operations with an integer constant operand. Even // opaque constant will be folded, because the folding of unary operations // doesn't create new constants with different values. Nevertheless, the // opaque flag is preserved during folding to prevent future folding with // other constants. if (ConstantSDNode *C = dyn_cast(Operand)) { const APInt &Val = C->getAPIntValue(); switch (Opcode) { default: break; case ISD::SIGN_EXTEND: return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: case ISD::TRUNCATE: return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: { APFloat apf(EVTToAPFloatSemantics(VT), APInt::getNullValue(VT.getSizeInBits())); (void)apf.convertFromAPInt(Val, Opcode==ISD::SINT_TO_FP, APFloat::rmNearestTiesToEven); return getConstantFP(apf, DL, VT); } case ISD::BITCAST: if (VT == MVT::f16 && C->getValueType(0) == MVT::i16) return getConstantFP(APFloat(APFloat::IEEEhalf, Val), DL, VT); if (VT == MVT::f32 && C->getValueType(0) == MVT::i32) return getConstantFP(APFloat(APFloat::IEEEsingle, Val), DL, VT); if (VT == MVT::f64 && C->getValueType(0) == MVT::i64) return getConstantFP(APFloat(APFloat::IEEEdouble, Val), DL, VT); if (VT == MVT::f128 && C->getValueType(0) == MVT::i128) return getConstantFP(APFloat(APFloat::IEEEquad, Val), DL, VT); break; case ISD::BSWAP: return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTPOP: return getConstant(Val.countPopulation(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: return getConstant(Val.countLeadingZeros(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(), C->isOpaque()); } } // Constant fold unary operations with a floating point constant operand. if (ConstantFPSDNode *C = dyn_cast(Operand)) { APFloat V = C->getValueAPF(); // make copy switch (Opcode) { case ISD::FNEG: V.changeSign(); return getConstantFP(V, DL, VT); case ISD::FABS: V.clearSign(); return getConstantFP(V, DL, VT); case ISD::FCEIL: { APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive); if (fs == APFloat::opOK || fs == APFloat::opInexact) return getConstantFP(V, DL, VT); break; } case ISD::FTRUNC: { APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero); if (fs == APFloat::opOK || fs == APFloat::opInexact) return getConstantFP(V, DL, VT); break; } case ISD::FFLOOR: { APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative); if (fs == APFloat::opOK || fs == APFloat::opInexact) return getConstantFP(V, DL, VT); break; } case ISD::FP_EXTEND: { bool ignored; // This can return overflow, underflow, or inexact; we don't care. // FIXME need to be more flexible about rounding mode. (void)V.convert(EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven, &ignored); return getConstantFP(V, DL, VT); } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { integerPart x[2]; bool ignored; static_assert(integerPartWidth >= 64, "APFloat parts too small!"); // FIXME need to be more flexible about rounding mode. APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(), Opcode==ISD::FP_TO_SINT, APFloat::rmTowardZero, &ignored); if (s==APFloat::opInvalidOp) // inexact is OK, in fact usual break; APInt api(VT.getSizeInBits(), x); return getConstant(api, DL, VT); } case ISD::BITCAST: if (VT == MVT::i16 && C->getValueType(0) == MVT::f16) return getConstant((uint16_t)V.bitcastToAPInt().getZExtValue(), DL, VT); else if (VT == MVT::i32 && C->getValueType(0) == MVT::f32) return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), DL, VT); else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64) return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT); break; } } // Constant fold unary operations with a vector integer or float operand. if (BuildVectorSDNode *BV = dyn_cast(Operand)) { if (BV->isConstant()) { switch (Opcode) { default: // FIXME: Entirely reasonable to perform folding of other unary // operations here as the need arises. break; case ISD::FNEG: case ISD::FABS: case ISD::FCEIL: case ISD::FTRUNC: case ISD::FFLOOR: case ISD::FP_EXTEND: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::TRUNCATE: case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: case ISD::BSWAP: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: case ISD::CTPOP: { SDValue Ops = { Operand }; if (SDValue Fold = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops)) return Fold; } } } } unsigned OpOpcode = Operand.getNode()->getOpcode(); switch (Opcode) { case ISD::TokenFactor: case ISD::MERGE_VALUES: case ISD::CONCAT_VECTORS: return Operand; // Factor, merge or concat of one node? No need. case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node"); case ISD::FP_EXTEND: assert(VT.isFloatingPoint() && Operand.getValueType().isFloatingPoint() && "Invalid FP cast!"); if (Operand.getValueType() == VT) return Operand; // noop conversion. assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); assert(Operand.getValueType().bitsLT(VT) && "Invalid fpext node, dst < src!"); if (Operand.getOpcode() == ISD::UNDEF) return getUNDEF(VT); break; case ISD::SIGN_EXTEND: assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid SIGN_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); assert(Operand.getValueType().bitsLT(VT) && "Invalid sext node, dst < src!"); if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND) return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); else if (OpOpcode == ISD::UNDEF) // sext(undef) = 0, because the top bits will all be the same. return getConstant(0, DL, VT); break; case ISD::ZERO_EXTEND: assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid ZERO_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); assert(Operand.getValueType().bitsLT(VT) && "Invalid zext node, dst < src!"); if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x) return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getNode()->getOperand(0)); else if (OpOpcode == ISD::UNDEF) // zext(undef) = 0, because the top bits will be zero. return getConstant(0, DL, VT); break; case ISD::ANY_EXTEND: assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid ANY_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); assert(Operand.getValueType().bitsLT(VT) && "Invalid anyext node, dst < src!"); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x) return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); else if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); // (ext (trunx x)) -> x if (OpOpcode == ISD::TRUNCATE) { SDValue OpOp = Operand.getNode()->getOperand(0); if (OpOp.getValueType() == VT) return OpOp; } break; case ISD::TRUNCATE: assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid TRUNCATE!"); if (Operand.getValueType() == VT) return Operand; // noop truncate assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); assert(Operand.getValueType().bitsGT(VT) && "Invalid truncate node, src < dst!"); if (OpOpcode == ISD::TRUNCATE) return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0)); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) { // If the source is smaller than the dest, we still need an extend. if (Operand.getNode()->getOperand(0).getValueType().getScalarType() .bitsLT(VT.getScalarType())) return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT)) return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0)); return Operand.getNode()->getOperand(0); } if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; case ISD::BSWAP: assert(VT.isInteger() && VT == Operand.getValueType() && "Invalid BSWAP!"); assert((VT.getScalarSizeInBits() % 16 == 0) && "BSWAP types must be a multiple of 16 bits!"); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; case ISD::BITCAST: // Basic sanity checking. assert(VT.getSizeInBits() == Operand.getValueType().getSizeInBits() && "Cannot BITCAST between types of different sizes!"); if (VT == Operand.getValueType()) return Operand; // noop conversion. if (OpOpcode == ISD::BITCAST) // bitconv(bitconv(x)) -> bitconv(x) return getNode(ISD::BITCAST, DL, VT, Operand.getOperand(0)); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; case ISD::SCALAR_TO_VECTOR: assert(VT.isVector() && !Operand.getValueType().isVector() && (VT.getVectorElementType() == Operand.getValueType() || (VT.getVectorElementType().isInteger() && Operand.getValueType().isInteger() && VT.getVectorElementType().bitsLE(Operand.getValueType()))) && "Illegal SCALAR_TO_VECTOR node!"); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); // scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined. if (OpOpcode == ISD::EXTRACT_VECTOR_ELT && isa(Operand.getOperand(1)) && Operand.getConstantOperandVal(1) == 0 && Operand.getOperand(0).getValueType() == VT) return Operand.getOperand(0); break; case ISD::FNEG: // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB) // FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags? return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1), Operand.getNode()->getOperand(0), &cast(Operand.getNode())->Flags); if (OpOpcode == ISD::FNEG) // --X -> X return Operand.getNode()->getOperand(0); break; case ISD::FABS: if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X) return getNode(ISD::FABS, DL, VT, Operand.getNode()->getOperand(0)); break; } SDNode *N; SDVTList VTs = getVTList(VT); if (VT != MVT::Glue) { // Don't CSE flag producing nodes FoldingSetNodeID ID; SDValue Ops[1] = { Operand }; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) return SDValue(E, 0); N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Operand); CSEMap.InsertNode(N, IP); } else { N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Operand); } InsertNode(N); return SDValue(N, 0); } static std::pair FoldValue(unsigned Opcode, const APInt &C1, const APInt &C2) { switch (Opcode) { case ISD::ADD: return std::make_pair(C1 + C2, true); case ISD::SUB: return std::make_pair(C1 - C2, true); case ISD::MUL: return std::make_pair(C1 * C2, true); case ISD::AND: return std::make_pair(C1 & C2, true); case ISD::OR: return std::make_pair(C1 | C2, true); case ISD::XOR: return std::make_pair(C1 ^ C2, true); case ISD::SHL: return std::make_pair(C1 << C2, true); case ISD::SRL: return std::make_pair(C1.lshr(C2), true); case ISD::SRA: return std::make_pair(C1.ashr(C2), true); case ISD::ROTL: return std::make_pair(C1.rotl(C2), true); case ISD::ROTR: return std::make_pair(C1.rotr(C2), true); case ISD::SMIN: return std::make_pair(C1.sle(C2) ? C1 : C2, true); case ISD::SMAX: return std::make_pair(C1.sge(C2) ? C1 : C2, true); case ISD::UMIN: return std::make_pair(C1.ule(C2) ? C1 : C2, true); case ISD::UMAX: return std::make_pair(C1.uge(C2) ? C1 : C2, true); case ISD::UDIV: if (!C2.getBoolValue()) break; return std::make_pair(C1.udiv(C2), true); case ISD::UREM: if (!C2.getBoolValue()) break; return std::make_pair(C1.urem(C2), true); case ISD::SDIV: if (!C2.getBoolValue()) break; return std::make_pair(C1.sdiv(C2), true); case ISD::SREM: if (!C2.getBoolValue()) break; return std::make_pair(C1.srem(C2), true); } return std::make_pair(APInt(1, 0), false); } SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, SDLoc DL, EVT VT, const ConstantSDNode *Cst1, const ConstantSDNode *Cst2) { if (Cst1->isOpaque() || Cst2->isOpaque()) return SDValue(); std::pair Folded = FoldValue(Opcode, Cst1->getAPIntValue(), Cst2->getAPIntValue()); if (!Folded.second) return SDValue(); return getConstant(Folded.first, DL, VT); } SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, SDLoc DL, EVT VT, SDNode *Cst1, SDNode *Cst2) { // If the opcode is a target-specific ISD node, there's nothing we can // do here and the operand rules may not line up with the below, so // bail early. if (Opcode >= ISD::BUILTIN_OP_END) return SDValue(); // Handle the case of two scalars. if (const ConstantSDNode *Scalar1 = dyn_cast(Cst1)) { if (const ConstantSDNode *Scalar2 = dyn_cast(Cst2)) { if (SDValue Folded = FoldConstantArithmetic(Opcode, DL, VT, Scalar1, Scalar2)) { if (!VT.isVector()) return Folded; SmallVector Outputs; // We may have a vector type but a scalar result. Create a splat. Outputs.resize(VT.getVectorNumElements(), Outputs.back()); // Build a big vector out of the scalar elements we generated. return getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Outputs); } else { return SDValue(); } } } // For vectors extract each constant element into Inputs so we can constant // fold them individually. BuildVectorSDNode *BV1 = dyn_cast(Cst1); BuildVectorSDNode *BV2 = dyn_cast(Cst2); if (!BV1 || !BV2) return SDValue(); assert(BV1->getNumOperands() == BV2->getNumOperands() && "Out of sync!"); EVT SVT = VT.getScalarType(); SmallVector Outputs; for (unsigned I = 0, E = BV1->getNumOperands(); I != E; ++I) { ConstantSDNode *V1 = dyn_cast(BV1->getOperand(I)); ConstantSDNode *V2 = dyn_cast(BV2->getOperand(I)); if (!V1 || !V2) // Not a constant, bail. return SDValue(); if (V1->isOpaque() || V2->isOpaque()) return SDValue(); // Avoid BUILD_VECTOR nodes that perform implicit truncation. // FIXME: This is valid and could be handled by truncating the APInts. if (V1->getValueType(0) != SVT || V2->getValueType(0) != SVT) return SDValue(); // Fold one vector element. std::pair Folded = FoldValue(Opcode, V1->getAPIntValue(), V2->getAPIntValue()); if (!Folded.second) return SDValue(); Outputs.push_back(getConstant(Folded.first, DL, SVT)); } assert(VT.getVectorNumElements() == Outputs.size() && "Vector size mismatch!"); // We may have a vector type but a scalar result. Create a splat. Outputs.resize(VT.getVectorNumElements(), Outputs.back()); // Build a big vector out of the scalar elements we generated. return getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Outputs); } SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, SDLoc DL, EVT VT, ArrayRef Ops, const SDNodeFlags *Flags) { // If the opcode is a target-specific ISD node, there's nothing we can // do here and the operand rules may not line up with the below, so // bail early. if (Opcode >= ISD::BUILTIN_OP_END) return SDValue(); // We can only fold vectors - maybe merge with FoldConstantArithmetic someday? if (!VT.isVector()) return SDValue(); unsigned NumElts = VT.getVectorNumElements(); auto IsScalarOrSameVectorSize = [&](const SDValue &Op) { return !Op.getValueType().isVector() || Op.getValueType().getVectorNumElements() == NumElts; }; auto IsConstantBuildVectorOrUndef = [&](const SDValue &Op) { BuildVectorSDNode *BV = dyn_cast(Op); return (Op.getOpcode() == ISD::UNDEF) || (Op.getOpcode() == ISD::CONDCODE) || (BV && BV->isConstant()); }; // All operands must be vector types with the same number of elements as // the result type and must be either UNDEF or a build vector of constant // or UNDEF scalars. if (!std::all_of(Ops.begin(), Ops.end(), IsConstantBuildVectorOrUndef) || !std::all_of(Ops.begin(), Ops.end(), IsScalarOrSameVectorSize)) return SDValue(); // If we are comparing vectors, then the result needs to be a i1 boolean // that is then sign-extended back to the legal result type. EVT SVT = (Opcode == ISD::SETCC ? MVT::i1 : VT.getScalarType()); // Find legal integer scalar type for constant promotion and // ensure that its scalar size is at least as large as source. EVT LegalSVT = VT.getScalarType(); if (LegalSVT.isInteger()) { LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT); if (LegalSVT.bitsLT(SVT)) return SDValue(); } // Constant fold each scalar lane separately. SmallVector ScalarResults; for (unsigned i = 0; i != NumElts; i++) { SmallVector ScalarOps; for (SDValue Op : Ops) { EVT InSVT = Op.getValueType().getScalarType(); BuildVectorSDNode *InBV = dyn_cast(Op); if (!InBV) { // We've checked that this is UNDEF or a constant of some kind. if (Op.isUndef()) ScalarOps.push_back(getUNDEF(InSVT)); else ScalarOps.push_back(Op); continue; } SDValue ScalarOp = InBV->getOperand(i); EVT ScalarVT = ScalarOp.getValueType(); // Build vector (integer) scalar operands may need implicit // truncation - do this before constant folding. if (ScalarVT.isInteger() && ScalarVT.bitsGT(InSVT)) ScalarOp = getNode(ISD::TRUNCATE, DL, InSVT, ScalarOp); ScalarOps.push_back(ScalarOp); } // Constant fold the scalar operands. SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags); // Legalize the (integer) scalar constant if necessary. if (LegalSVT != SVT) ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult); // Scalar folding only succeeded if the result is a constant or UNDEF. if (ScalarResult.getOpcode() != ISD::UNDEF && ScalarResult.getOpcode() != ISD::Constant && ScalarResult.getOpcode() != ISD::ConstantFP) return SDValue(); ScalarResults.push_back(ScalarResult); } assert(ScalarResults.size() == NumElts && "Unexpected number of scalar results for BUILD_VECTOR"); return getNode(ISD::BUILD_VECTOR, DL, VT, ScalarResults); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, const SDNodeFlags *Flags) { ConstantSDNode *N1C = dyn_cast(N1); ConstantSDNode *N2C = dyn_cast(N2); ConstantFPSDNode *N1CFP = dyn_cast(N1); ConstantFPSDNode *N2CFP = dyn_cast(N2); // Canonicalize constant to RHS if commutative. if (isCommutativeBinOp(Opcode)) { if (N1C && !N2C) { std::swap(N1C, N2C); std::swap(N1, N2); } else if (N1CFP && !N2CFP) { std::swap(N1CFP, N2CFP); std::swap(N1, N2); } } switch (Opcode) { default: break; case ISD::TokenFactor: assert(VT == MVT::Other && N1.getValueType() == MVT::Other && N2.getValueType() == MVT::Other && "Invalid token factor!"); // Fold trivial token factors. if (N1.getOpcode() == ISD::EntryToken) return N2; if (N2.getOpcode() == ISD::EntryToken) return N1; if (N1 == N2) return N1; break; case ISD::CONCAT_VECTORS: { // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF. SDValue Ops[] = {N1, N2}; if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this)) return V; break; } case ISD::AND: assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); // (X & 0) -> 0. This commonly occurs when legalizing i64 values, so it's // worth handling here. if (N2C && N2C->isNullValue()) return N2; if (N2C && N2C->isAllOnesValue()) // X & -1 -> X return N1; break; case ISD::OR: case ISD::XOR: case ISD::ADD: case ISD::SUB: assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); // (X ^|+- 0) -> X. This commonly occurs when legalizing i64 values, so // it's worth handling here. if (N2C && N2C->isNullValue()) return N1; break; case ISD::UDIV: case ISD::UREM: case ISD::MULHU: case ISD::MULHS: case ISD::MUL: case ISD::SDIV: case ISD::SREM: case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); break; case ISD::FADD: case ISD::FSUB: case ISD::FMUL: case ISD::FDIV: case ISD::FREM: if (getTarget().Options.UnsafeFPMath) { if (Opcode == ISD::FADD) { // x+0 --> x if (N2CFP && N2CFP->getValueAPF().isZero()) return N1; } else if (Opcode == ISD::FSUB) { // x-0 --> x if (N2CFP && N2CFP->getValueAPF().isZero()) return N1; } else if (Opcode == ISD::FMUL) { // x*0 --> 0 if (N2CFP && N2CFP->isZero()) return N2; // x*1 --> x if (N2CFP && N2CFP->isExactlyValue(1.0)) return N1; } } assert(VT.isFloatingPoint() && "This operator only applies to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); break; case ISD::FCOPYSIGN: // N1 and result must match. N1/N2 need not match. assert(N1.getValueType() == VT && N1.getValueType().isFloatingPoint() && N2.getValueType().isFloatingPoint() && "Invalid FCOPYSIGN!"); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::ROTL: case ISD::ROTR: assert(VT == N1.getValueType() && "Shift operators return type must be the same as their first arg"); assert(VT.isInteger() && N2.getValueType().isInteger() && "Shifts only work on integers"); assert((!VT.isVector() || VT == N2.getValueType()) && "Vector shift amounts must be in the same as their first arg"); // Verify that the shift amount VT is bit enough to hold valid shift // amounts. This catches things like trying to shift an i1024 value by an // i8, which is easy to fall into in generic code that uses // TLI.getShiftAmount(). assert(N2.getValueType().getSizeInBits() >= Log2_32_Ceil(N1.getValueType().getSizeInBits()) && "Invalid use of small shift amount with oversized value!"); // Always fold shifts of i1 values so the code generator doesn't need to // handle them. Since we know the size of the shift has to be less than the // size of the value, the shift/rotate count is guaranteed to be zero. if (VT == MVT::i1) return N1; if (N2C && N2C->isNullValue()) return N1; break; case ISD::FP_ROUND_INREG: { EVT EVT = cast(N2)->getVT(); assert(VT == N1.getValueType() && "Not an inreg round!"); assert(VT.isFloatingPoint() && EVT.isFloatingPoint() && "Cannot FP_ROUND_INREG integer types"); assert(EVT.isVector() == VT.isVector() && "FP_ROUND_INREG type should be vector iff the operand " "type is vector!"); assert((!EVT.isVector() || EVT.getVectorNumElements() == VT.getVectorNumElements()) && "Vector element counts must match in FP_ROUND_INREG"); assert(EVT.bitsLE(VT) && "Not rounding down!"); (void)EVT; if (cast(N2)->getVT() == VT) return N1; // Not actually rounding. break; } case ISD::FP_ROUND: assert(VT.isFloatingPoint() && N1.getValueType().isFloatingPoint() && VT.bitsLE(N1.getValueType()) && N2C && "Invalid FP_ROUND!"); if (N1.getValueType() == VT) return N1; // noop conversion. break; case ISD::AssertSext: case ISD::AssertZext: { EVT EVT = cast(N2)->getVT(); assert(VT == N1.getValueType() && "Not an inreg extend!"); assert(VT.isInteger() && EVT.isInteger() && "Cannot *_EXTEND_INREG FP types"); assert(!EVT.isVector() && "AssertSExt/AssertZExt type should be the vector element type " "rather than the vector type!"); assert(EVT.bitsLE(VT) && "Not extending!"); if (VT == EVT) return N1; // noop assertion. break; } case ISD::SIGN_EXTEND_INREG: { EVT EVT = cast(N2)->getVT(); assert(VT == N1.getValueType() && "Not an inreg extend!"); assert(VT.isInteger() && EVT.isInteger() && "Cannot *_EXTEND_INREG FP types"); assert(EVT.isVector() == VT.isVector() && "SIGN_EXTEND_INREG type should be vector iff the operand " "type is vector!"); assert((!EVT.isVector() || EVT.getVectorNumElements() == VT.getVectorNumElements()) && "Vector element counts must match in SIGN_EXTEND_INREG"); assert(EVT.bitsLE(VT) && "Not extending!"); if (EVT == VT) return N1; // Not actually extending auto SignExtendInReg = [&](APInt Val) { unsigned FromBits = EVT.getScalarType().getSizeInBits(); Val <<= Val.getBitWidth() - FromBits; Val = Val.ashr(Val.getBitWidth() - FromBits); return getConstant(Val, DL, VT.getScalarType()); }; if (N1C) { APInt Val = N1C->getAPIntValue(); return SignExtendInReg(Val); } if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) { SmallVector Ops; for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { SDValue Op = N1.getOperand(i); if (Op.getOpcode() == ISD::UNDEF) { Ops.push_back(getUNDEF(VT.getScalarType())); continue; } if (ConstantSDNode *C = dyn_cast(Op)) { APInt Val = C->getAPIntValue(); Val = Val.zextOrTrunc(VT.getScalarSizeInBits()); Ops.push_back(SignExtendInReg(Val)); continue; } break; } if (Ops.size() == VT.getVectorNumElements()) return getNode(ISD::BUILD_VECTOR, DL, VT, Ops); } break; } case ISD::EXTRACT_VECTOR_ELT: // EXTRACT_VECTOR_ELT of an UNDEF is an UNDEF. if (N1.getOpcode() == ISD::UNDEF) return getUNDEF(VT); // EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF if (N2C && N2C->getZExtValue() >= N1.getValueType().getVectorNumElements()) return getUNDEF(VT); // EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is // expanding copies of large vectors from registers. if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS && N1.getNumOperands() > 0) { unsigned Factor = N1.getOperand(0).getValueType().getVectorNumElements(); return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(N2C->getZExtValue() / Factor), getConstant(N2C->getZExtValue() % Factor, DL, N2.getValueType())); } // EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is // expanding large vector constants. if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) { SDValue Elt = N1.getOperand(N2C->getZExtValue()); if (VT != Elt.getValueType()) // If the vector element type is not legal, the BUILD_VECTOR operands // are promoted and implicitly truncated, and the result implicitly // extended. Make that explicit here. Elt = getAnyExtOrTrunc(Elt, DL, VT); return Elt; } // EXTRACT_VECTOR_ELT of INSERT_VECTOR_ELT is often formed when vector // operations are lowered to scalars. if (N1.getOpcode() == ISD::INSERT_VECTOR_ELT) { // If the indices are the same, return the inserted element else // if the indices are known different, extract the element from // the original vector. SDValue N1Op2 = N1.getOperand(2); ConstantSDNode *N1Op2C = dyn_cast(N1Op2); if (N1Op2C && N2C) { if (N1Op2C->getZExtValue() == N2C->getZExtValue()) { if (VT == N1.getOperand(1).getValueType()) return N1.getOperand(1); else return getSExtOrTrunc(N1.getOperand(1), DL, VT); } return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2); } } break; case ISD::EXTRACT_ELEMENT: assert(N2C && (unsigned)N2C->getZExtValue() < 2 && "Bad EXTRACT_ELEMENT!"); assert(!N1.getValueType().isVector() && !VT.isVector() && (N1.getValueType().isInteger() == VT.isInteger()) && N1.getValueType() != VT && "Wrong types for EXTRACT_ELEMENT!"); // EXTRACT_ELEMENT of BUILD_PAIR is often formed while legalize is expanding // 64-bit integers into 32-bit parts. Instead of building the extract of // the BUILD_PAIR, only to have legalize rip it apart, just do it now. if (N1.getOpcode() == ISD::BUILD_PAIR) return N1.getOperand(N2C->getZExtValue()); // EXTRACT_ELEMENT of a constant int is also very common. if (N1C) { unsigned ElementSize = VT.getSizeInBits(); unsigned Shift = ElementSize * N2C->getZExtValue(); APInt ShiftedVal = N1C->getAPIntValue().lshr(Shift); return getConstant(ShiftedVal.trunc(ElementSize), DL, VT); } break; case ISD::EXTRACT_SUBVECTOR: if (VT.isSimple() && N1.getValueType().isSimple()) { assert(VT.isVector() && N1.getValueType().isVector() && "Extract subvector VTs must be a vectors!"); assert(VT.getVectorElementType() == N1.getValueType().getVectorElementType() && "Extract subvector VTs must have the same element type!"); assert(VT.getSimpleVT() <= N1.getSimpleValueType() && "Extract subvector must be from larger vector to smaller vector!"); if (N2C) { assert((VT.getVectorNumElements() + N2C->getZExtValue() <= N1.getValueType().getVectorNumElements()) && "Extract subvector overflow!"); } // Trivial extraction. if (VT.getSimpleVT() == N1.getSimpleValueType()) return N1; } break; } // Perform trivial constant folding. if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, N1.getNode(), N2.getNode())) return SV; // Constant fold FP operations. bool HasFPExceptions = TLI->hasFloatingPointExceptions(); if (N1CFP) { if (N2CFP) { APFloat V1 = N1CFP->getValueAPF(), V2 = N2CFP->getValueAPF(); APFloat::opStatus s; switch (Opcode) { case ISD::FADD: s = V1.add(V2, APFloat::rmNearestTiesToEven); if (!HasFPExceptions || s != APFloat::opInvalidOp) return getConstantFP(V1, DL, VT); break; case ISD::FSUB: s = V1.subtract(V2, APFloat::rmNearestTiesToEven); if (!HasFPExceptions || s!=APFloat::opInvalidOp) return getConstantFP(V1, DL, VT); break; case ISD::FMUL: s = V1.multiply(V2, APFloat::rmNearestTiesToEven); if (!HasFPExceptions || s!=APFloat::opInvalidOp) return getConstantFP(V1, DL, VT); break; case ISD::FDIV: s = V1.divide(V2, APFloat::rmNearestTiesToEven); if (!HasFPExceptions || (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero)) { return getConstantFP(V1, DL, VT); } break; case ISD::FREM : s = V1.mod(V2); if (!HasFPExceptions || (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero)) { return getConstantFP(V1, DL, VT); } break; case ISD::FCOPYSIGN: V1.copySign(V2); return getConstantFP(V1, DL, VT); default: break; } } if (Opcode == ISD::FP_ROUND) { APFloat V = N1CFP->getValueAPF(); // make copy bool ignored; // This can return overflow, underflow, or inexact; we don't care. // FIXME need to be more flexible about rounding mode. (void)V.convert(EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven, &ignored); return getConstantFP(V, DL, VT); } } // Canonicalize an UNDEF to the RHS, even over a constant. if (N1.getOpcode() == ISD::UNDEF) { if (isCommutativeBinOp(Opcode)) { std::swap(N1, N2); } else { switch (Opcode) { case ISD::FP_ROUND_INREG: case ISD::SIGN_EXTEND_INREG: case ISD::SUB: case ISD::FSUB: case ISD::FDIV: case ISD::FREM: case ISD::SRA: return N1; // fold op(undef, arg2) -> undef case ISD::UDIV: case ISD::SDIV: case ISD::UREM: case ISD::SREM: case ISD::SRL: case ISD::SHL: if (!VT.isVector()) return getConstant(0, DL, VT); // fold op(undef, arg2) -> 0 // For vectors, we can't easily build an all zero vector, just return // the LHS. return N2; } } } // Fold a bunch of operators when the RHS is undef. if (N2.getOpcode() == ISD::UNDEF) { switch (Opcode) { case ISD::XOR: if (N1.getOpcode() == ISD::UNDEF) // Handle undef ^ undef -> 0 special case. This is a common // idiom (misuse). return getConstant(0, DL, VT); // fallthrough case ISD::ADD: case ISD::ADDC: case ISD::ADDE: case ISD::SUB: case ISD::UDIV: case ISD::SDIV: case ISD::UREM: case ISD::SREM: return N2; // fold op(arg1, undef) -> undef case ISD::FADD: case ISD::FSUB: case ISD::FMUL: case ISD::FDIV: case ISD::FREM: if (getTarget().Options.UnsafeFPMath) return N2; break; case ISD::MUL: case ISD::AND: case ISD::SRL: case ISD::SHL: if (!VT.isVector()) return getConstant(0, DL, VT); // fold op(arg1, undef) -> 0 // For vectors, we can't easily build an all zero vector, just return // the LHS. return N1; case ISD::OR: if (!VT.isVector()) return getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); // For vectors, we can't easily build an all one vector, just return // the LHS. return N1; case ISD::SRA: return N1; } } // Memoize this node if possible. BinarySDNode *N; SDVTList VTs = getVTList(VT); if (VT != MVT::Glue) { SDValue Ops[] = {N1, N2}; FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); - AddNodeIDFlags(ID, Opcode, Flags); void *IP = nullptr; - if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) + if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) { + if (Flags) + E->intersectFlagsWith(Flags); return SDValue(E, 0); + } N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags); CSEMap.InsertNode(N, IP); } else { N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags); } InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3) { // Perform various simplifications. switch (Opcode) { case ISD::FMA: { ConstantFPSDNode *N1CFP = dyn_cast(N1); ConstantFPSDNode *N2CFP = dyn_cast(N2); ConstantFPSDNode *N3CFP = dyn_cast(N3); if (N1CFP && N2CFP && N3CFP) { APFloat V1 = N1CFP->getValueAPF(); const APFloat &V2 = N2CFP->getValueAPF(); const APFloat &V3 = N3CFP->getValueAPF(); APFloat::opStatus s = V1.fusedMultiplyAdd(V2, V3, APFloat::rmNearestTiesToEven); if (!TLI->hasFloatingPointExceptions() || s != APFloat::opInvalidOp) return getConstantFP(V1, DL, VT); } break; } case ISD::CONCAT_VECTORS: { // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF. SDValue Ops[] = {N1, N2, N3}; if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this)) return V; break; } case ISD::SETCC: { // Use FoldSetCC to simplify SETCC's. if (SDValue V = FoldSetCC(VT, N1, N2, cast(N3)->get(), DL)) return V; // Vector constant folding. SDValue Ops[] = {N1, N2, N3}; if (SDValue V = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops)) return V; break; } case ISD::SELECT: if (ConstantSDNode *N1C = dyn_cast(N1)) { if (N1C->getZExtValue()) return N2; // select true, X, Y -> X return N3; // select false, X, Y -> Y } if (N2 == N3) return N2; // select C, X, X -> X break; case ISD::VECTOR_SHUFFLE: llvm_unreachable("should use getVectorShuffle constructor!"); case ISD::INSERT_SUBVECTOR: { SDValue Index = N3; if (VT.isSimple() && N1.getValueType().isSimple() && N2.getValueType().isSimple()) { assert(VT.isVector() && N1.getValueType().isVector() && N2.getValueType().isVector() && "Insert subvector VTs must be a vectors"); assert(VT == N1.getValueType() && "Dest and insert subvector source types must match!"); assert(N2.getSimpleValueType() <= N1.getSimpleValueType() && "Insert subvector must be from smaller vector to larger vector!"); if (isa(Index)) { assert((N2.getValueType().getVectorNumElements() + cast(Index)->getZExtValue() <= VT.getVectorNumElements()) && "Insert subvector overflow!"); } // Trivial insertion. if (VT.getSimpleVT() == N2.getSimpleValueType()) return N2; } break; } case ISD::BITCAST: // Fold bit_convert nodes from a type to themselves. if (N1.getValueType() == VT) return N1; break; } // Memoize node if it doesn't produce a flag. SDNode *N; SDVTList VTs = getVTList(VT); if (VT != MVT::Glue) { SDValue Ops[] = { N1, N2, N3 }; FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) return SDValue(E, 0); N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2, N3); CSEMap.InsertNode(N, IP); } else { N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2, N3); } InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3, SDValue N4) { SDValue Ops[] = { N1, N2, N3, N4 }; return getNode(Opcode, DL, VT, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3, SDValue N4, SDValue N5) { SDValue Ops[] = { N1, N2, N3, N4, N5 }; return getNode(Opcode, DL, VT, Ops); } /// getStackArgumentTokenFactor - Compute a TokenFactor to force all /// the incoming stack arguments to be loaded from the stack. SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) { SmallVector ArgChains; // Include the original chain at the beginning of the list. When this is // used by target LowerCall hooks, this helps legalize find the // CALLSEQ_BEGIN node. ArgChains.push_back(Chain); // Add a chain value for each stack argument. for (SDNode::use_iterator U = getEntryNode().getNode()->use_begin(), UE = getEntryNode().getNode()->use_end(); U != UE; ++U) if (LoadSDNode *L = dyn_cast(*U)) if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) if (FI->getIndex() < 0) ArgChains.push_back(SDValue(L, 1)); // Build a tokenfactor for all the chains. return getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); } /// getMemsetValue - Vectorized representation of the memset value /// operand. static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG, SDLoc dl) { assert(Value.getOpcode() != ISD::UNDEF); unsigned NumBits = VT.getScalarType().getSizeInBits(); if (ConstantSDNode *C = dyn_cast(Value)) { assert(C->getAPIntValue().getBitWidth() == 8); APInt Val = APInt::getSplat(NumBits, C->getAPIntValue()); if (VT.isInteger()) return DAG.getConstant(Val, dl, VT); return DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(VT), Val), dl, VT); } assert(Value.getValueType() == MVT::i8 && "memset with non-byte fill value?"); EVT IntVT = VT.getScalarType(); if (!IntVT.isInteger()) IntVT = EVT::getIntegerVT(*DAG.getContext(), IntVT.getSizeInBits()); Value = DAG.getNode(ISD::ZERO_EXTEND, dl, IntVT, Value); if (NumBits > 8) { // Use a multiplication with 0x010101... to extend the input to the // required length. APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01)); Value = DAG.getNode(ISD::MUL, dl, IntVT, Value, DAG.getConstant(Magic, dl, IntVT)); } if (VT != Value.getValueType() && !VT.isInteger()) Value = DAG.getNode(ISD::BITCAST, dl, VT.getScalarType(), Value); if (VT != Value.getValueType()) { assert(VT.getVectorElementType() == Value.getValueType() && "value type should be one vector element here"); SmallVector BVOps(VT.getVectorNumElements(), Value); Value = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, BVOps); } return Value; } /// getMemsetStringVal - Similar to getMemsetValue. Except this is only /// used when a memcpy is turned into a memset when the source is a constant /// string ptr. static SDValue getMemsetStringVal(EVT VT, SDLoc dl, SelectionDAG &DAG, const TargetLowering &TLI, StringRef Str) { // Handle vector with all elements zero. if (Str.empty()) { if (VT.isInteger()) return DAG.getConstant(0, dl, VT); else if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128) return DAG.getConstantFP(0.0, dl, VT); else if (VT.isVector()) { unsigned NumElts = VT.getVectorNumElements(); MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getConstant(0, dl, EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts))); } else llvm_unreachable("Expected type!"); } assert(!VT.isVector() && "Can't handle vector type here!"); unsigned NumVTBits = VT.getSizeInBits(); unsigned NumVTBytes = NumVTBits / 8; unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size())); APInt Val(NumVTBits, 0); if (DAG.getDataLayout().isLittleEndian()) { for (unsigned i = 0; i != NumBytes; ++i) Val |= (uint64_t)(unsigned char)Str[i] << i*8; } else { for (unsigned i = 0; i != NumBytes; ++i) Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8; } // If the "cost" of materializing the integer immediate is less than the cost // of a load, then it is cost effective to turn the load into the immediate. Type *Ty = VT.getTypeForEVT(*DAG.getContext()); if (TLI.shouldConvertConstantLoadToIntImm(Val, Ty)) return DAG.getConstant(Val, dl, VT); return SDValue(nullptr, 0); } /// getMemBasePlusOffset - Returns base and offset node for the /// static SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset, SDLoc dl, SelectionDAG &DAG) { EVT VT = Base.getValueType(); return DAG.getNode(ISD::ADD, dl, VT, Base, DAG.getConstant(Offset, dl, VT)); } /// isMemSrcFromString - Returns true if memcpy source is a string constant. /// static bool isMemSrcFromString(SDValue Src, StringRef &Str) { unsigned SrcDelta = 0; GlobalAddressSDNode *G = nullptr; if (Src.getOpcode() == ISD::GlobalAddress) G = cast(Src); else if (Src.getOpcode() == ISD::ADD && Src.getOperand(0).getOpcode() == ISD::GlobalAddress && Src.getOperand(1).getOpcode() == ISD::Constant) { G = cast(Src.getOperand(0)); SrcDelta = cast(Src.getOperand(1))->getZExtValue(); } if (!G) return false; return getConstantStringInfo(G->getGlobal(), Str, SrcDelta, false); } /// Determines the optimal series of memory ops to replace the memset / memcpy. /// Return true if the number of memory ops is below the threshold (Limit). /// It returns the types of the sequence of memory ops to perform /// memset / memcpy by reference. static bool FindOptimalMemOpLowering(std::vector &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, bool AllowOverlap, SelectionDAG &DAG, const TargetLowering &TLI) { assert((SrcAlign == 0 || SrcAlign >= DstAlign) && "Expecting memcpy / memset source to meet alignment requirement!"); // If 'SrcAlign' is zero, that means the memory operation does not need to // load the value, i.e. memset or memcpy from constant string. Otherwise, // it's the inferred alignment of the source. 'DstAlign', on the other hand, // is the specified alignment of the memory operation. If it is zero, that // means it's possible to change the alignment of the destination. // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does // not need to be loaded. EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign, IsMemset, ZeroMemset, MemcpyStrSrc, DAG.getMachineFunction()); if (VT == MVT::Other) { unsigned AS = 0; if (DstAlign >= DAG.getDataLayout().getPointerPrefAlignment(AS) || TLI.allowsMisalignedMemoryAccesses(VT, AS, DstAlign)) { VT = TLI.getPointerTy(DAG.getDataLayout()); } else { switch (DstAlign & 7) { case 0: VT = MVT::i64; break; case 4: VT = MVT::i32; break; case 2: VT = MVT::i16; break; default: VT = MVT::i8; break; } } MVT LVT = MVT::i64; while (!TLI.isTypeLegal(LVT)) LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1); assert(LVT.isInteger()); if (VT.bitsGT(LVT)) VT = LVT; } unsigned NumMemOps = 0; while (Size != 0) { unsigned VTSize = VT.getSizeInBits() / 8; while (VTSize > Size) { // For now, only use non-vector load / store's for the left-over pieces. EVT NewVT = VT; unsigned NewVTSize; bool Found = false; if (VT.isVector() || VT.isFloatingPoint()) { NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32; if (TLI.isOperationLegalOrCustom(ISD::STORE, NewVT) && TLI.isSafeMemOpType(NewVT.getSimpleVT())) Found = true; else if (NewVT == MVT::i64 && TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64) && TLI.isSafeMemOpType(MVT::f64)) { // i64 is usually not legal on 32-bit targets, but f64 may be. NewVT = MVT::f64; Found = true; } } if (!Found) { do { NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1); if (NewVT == MVT::i8) break; } while (!TLI.isSafeMemOpType(NewVT.getSimpleVT())); } NewVTSize = NewVT.getSizeInBits() / 8; // If the new VT cannot cover all of the remaining bits, then consider // issuing a (or a pair of) unaligned and overlapping load / store. // FIXME: Only does this for 64-bit or more since we don't have proper // cost model for unaligned load / store. bool Fast; unsigned AS = 0; if (NumMemOps && AllowOverlap && VTSize >= 8 && NewVTSize < Size && TLI.allowsMisalignedMemoryAccesses(VT, AS, DstAlign, &Fast) && Fast) VTSize = Size; else { VT = NewVT; VTSize = NewVTSize; } } if (++NumMemOps > Limit) return false; MemOps.push_back(VT); Size -= VTSize; } return true; } static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { // On Darwin, -Os means optimize for size without hurting performance, so // only really optimize for size when -Oz (MinSize) is used. if (MF.getTarget().getTargetTriple().isOSDarwin()) return MF.getFunction()->optForMinSize(); return MF.getFunction()->optForSize(); } static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, unsigned Align, bool isVol, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { // Turn a memcpy of undef to nop. if (Src.getOpcode() == ISD::UNDEF) return Chain; // Expand memcpy to a series of load and store ops if the size operand falls // below a certain threshold. // TODO: In the AlwaysInline case, if the size is big then generate a loop // rather than maybe a humongous number of loads and stores. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); std::vector MemOps; bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); bool OptSize = shouldLowerMemFuncForSize(MF); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; unsigned SrcAlign = DAG.InferPtrAlignment(Src); if (Align > SrcAlign) SrcAlign = Align; StringRef Str; bool CopyFromStr = isMemSrcFromString(Src, Str); bool isZeroStr = CopyFromStr && Str.empty(); unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize); if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), (isZeroStr ? 0 : SrcAlign), false, false, CopyFromStr, true, DAG, TLI)) return SDValue(); if (DstAlignCanChange) { Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty); // Don't promote to an alignment that would require dynamic stack // realignment. const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TRI->needsStackRealignment(MF)) while (NewAlign > Align && DAG.getDataLayout().exceedsNaturalStackAlignment(NewAlign)) NewAlign /= 2; if (NewAlign > Align) { // Give the stack frame object a larger alignment if needed. if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign) MFI->setObjectAlignment(FI->getIndex(), NewAlign); Align = NewAlign; } } SmallVector OutChains; unsigned NumMemOps = MemOps.size(); uint64_t SrcOff = 0, DstOff = 0; for (unsigned i = 0; i != NumMemOps; ++i) { EVT VT = MemOps[i]; unsigned VTSize = VT.getSizeInBits() / 8; SDValue Value, Store; if (VTSize > Size) { // Issuing an unaligned load / store pair that overlaps with the previous // pair. Adjust the offset accordingly. assert(i == NumMemOps-1 && i != 0); SrcOff -= VTSize - Size; DstOff -= VTSize - Size; } if (CopyFromStr && (isZeroStr || (VT.isInteger() && !VT.isVector()))) { // It's unlikely a store of a vector immediate can be done in a single // instruction. It would require a load from a constantpool first. // We only handle zero vectors here. // FIXME: Handle other cases where store of vector immediate is done in // a single instruction. Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff)); if (Value.getNode()) Store = DAG.getStore(Chain, dl, Value, getMemBasePlusOffset(Dst, DstOff, dl, DAG), DstPtrInfo.getWithOffset(DstOff), isVol, false, Align); } if (!Store.getNode()) { // The type might not be legal for the target. This should only happen // if the type is smaller than a legal type, as on PPC, so the right // thing to do is generate a LoadExt/StoreTrunc pair. These simplify // to Load/Store if NVT==VT. // FIXME does the case above also need this? EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); assert(NVT.bitsGE(VT)); Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain, getMemBasePlusOffset(Src, SrcOff, dl, DAG), SrcPtrInfo.getWithOffset(SrcOff), VT, isVol, false, false, MinAlign(SrcAlign, SrcOff)); Store = DAG.getTruncStore(Chain, dl, Value, getMemBasePlusOffset(Dst, DstOff, dl, DAG), DstPtrInfo.getWithOffset(DstOff), VT, isVol, false, Align); } OutChains.push_back(Store); SrcOff += VTSize; DstOff += VTSize; Size -= VTSize; } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, unsigned Align, bool isVol, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { // Turn a memmove of undef to nop. if (Src.getOpcode() == ISD::UNDEF) return Chain; // Expand memmove to a series of load and store ops if the size operand falls // below a certain threshold. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); std::vector MemOps; bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); bool OptSize = shouldLowerMemFuncForSize(MF); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; unsigned SrcAlign = DAG.InferPtrAlignment(Src); if (Align > SrcAlign) SrcAlign = Align; unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize); if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), SrcAlign, false, false, false, false, DAG, TLI)) return SDValue(); if (DstAlignCanChange) { Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty); if (NewAlign > Align) { // Give the stack frame object a larger alignment if needed. if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign) MFI->setObjectAlignment(FI->getIndex(), NewAlign); Align = NewAlign; } } uint64_t SrcOff = 0, DstOff = 0; SmallVector LoadValues; SmallVector LoadChains; SmallVector OutChains; unsigned NumMemOps = MemOps.size(); for (unsigned i = 0; i < NumMemOps; i++) { EVT VT = MemOps[i]; unsigned VTSize = VT.getSizeInBits() / 8; SDValue Value; Value = DAG.getLoad(VT, dl, Chain, getMemBasePlusOffset(Src, SrcOff, dl, DAG), SrcPtrInfo.getWithOffset(SrcOff), isVol, false, false, SrcAlign); LoadValues.push_back(Value); LoadChains.push_back(Value.getValue(1)); SrcOff += VTSize; } Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); OutChains.clear(); for (unsigned i = 0; i < NumMemOps; i++) { EVT VT = MemOps[i]; unsigned VTSize = VT.getSizeInBits() / 8; SDValue Store; Store = DAG.getStore(Chain, dl, LoadValues[i], getMemBasePlusOffset(Dst, DstOff, dl, DAG), DstPtrInfo.getWithOffset(DstOff), isVol, false, Align); OutChains.push_back(Store); DstOff += VTSize; } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } /// \brief Lower the call to 'memset' intrinsic function into a series of store /// operations. /// /// \param DAG Selection DAG where lowered code is placed. /// \param dl Link to corresponding IR location. /// \param Chain Control flow dependency. /// \param Dst Pointer to destination memory location. /// \param Src Value of byte to write into the memory. /// \param Size Number of bytes to write. /// \param Align Alignment of the destination in bytes. /// \param isVol True if destination is volatile. /// \param DstPtrInfo IR information on the memory pointer. /// \returns New head in the control flow, if lowering was successful, empty /// SDValue otherwise. /// /// The function tries to replace 'llvm.memset' intrinsic with several store /// operations and value calculation code. This is usually profitable for small /// memory size. static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, unsigned Align, bool isVol, MachinePointerInfo DstPtrInfo) { // Turn a memset of undef to nop. if (Src.getOpcode() == ISD::UNDEF) return Chain; // Expand memset to a series of load/store ops if the size operand // falls below a certain threshold. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); std::vector MemOps; bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); bool OptSize = shouldLowerMemFuncForSize(MF); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; bool IsZeroVal = isa(Src) && cast(Src)->isNullValue(); if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize), Size, (DstAlignCanChange ? 0 : Align), 0, true, IsZeroVal, false, true, DAG, TLI)) return SDValue(); if (DstAlignCanChange) { Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty); if (NewAlign > Align) { // Give the stack frame object a larger alignment if needed. if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign) MFI->setObjectAlignment(FI->getIndex(), NewAlign); Align = NewAlign; } } SmallVector OutChains; uint64_t DstOff = 0; unsigned NumMemOps = MemOps.size(); // Find the largest store and generate the bit pattern for it. EVT LargestVT = MemOps[0]; for (unsigned i = 1; i < NumMemOps; i++) if (MemOps[i].bitsGT(LargestVT)) LargestVT = MemOps[i]; SDValue MemSetValue = getMemsetValue(Src, LargestVT, DAG, dl); for (unsigned i = 0; i < NumMemOps; i++) { EVT VT = MemOps[i]; unsigned VTSize = VT.getSizeInBits() / 8; if (VTSize > Size) { // Issuing an unaligned load / store pair that overlaps with the previous // pair. Adjust the offset accordingly. assert(i == NumMemOps-1 && i != 0); DstOff -= VTSize - Size; } // If this store is smaller than the largest store see whether we can get // the smaller value for free with a truncate. SDValue Value = MemSetValue; if (VT.bitsLT(LargestVT)) { if (!LargestVT.isVector() && !VT.isVector() && TLI.isTruncateFree(LargestVT, VT)) Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue); else Value = getMemsetValue(Src, VT, DAG, dl); } assert(Value.getValueType() == VT && "Value with wrong type."); SDValue Store = DAG.getStore(Chain, dl, Value, getMemBasePlusOffset(Dst, DstOff, dl, DAG), DstPtrInfo.getWithOffset(DstOff), isVol, false, Align); OutChains.push_back(Store); DstOff += VT.getSizeInBits() / 8; Size -= VTSize; } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI, unsigned AS) { // Lowering memcpy / memset / memmove intrinsics to calls is only valid if all // pointer operands can be losslessly bitcasted to pointers of address space 0 if (AS != 0 && !TLI->isNoopAddrSpaceCast(AS, 0)) { report_fatal_error("cannot lower memory intrinsic in address space " + Twine(AS)); } } SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { assert(Align && "The SDAG layer expects explicit alignment and reserves 0"); // Check to see if we should lower the memcpy to loads and stores first. // For cases within the target-specified limits, this is the best choice. ConstantSDNode *ConstantSize = dyn_cast(Size); if (ConstantSize) { // Memcpy with size zero? Just return the original chain. if (ConstantSize->isNullValue()) return Chain; SDValue Result = getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),Align, isVol, false, DstPtrInfo, SrcPtrInfo); if (Result.getNode()) return Result; } // Then check to see if we should lower the memcpy with target-specific // code. If the target chooses to do this, this is the next best. if (TSI) { SDValue Result = TSI->EmitTargetCodeForMemcpy( *this, dl, Chain, Dst, Src, Size, Align, isVol, AlwaysInline, DstPtrInfo, SrcPtrInfo); if (Result.getNode()) return Result; } // If we really need inline code and the target declined to provide it, // use a (potentially long) sequence of loads and stores. if (AlwaysInline) { assert(ConstantSize && "AlwaysInline requires a constant size!"); return getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Align, isVol, true, DstPtrInfo, SrcPtrInfo); } checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace()); checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace()); // FIXME: If the memcpy is volatile (isVol), lowering it to a plain libc // memcpy is not guaranteed to be safe. libc memcpys aren't required to // respect volatile, so they may do things like read or write memory // beyond the given memory regions. But fixing this isn't easy, and most // people don't care. // Emit a library call. TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Ty = getDataLayout().getIntPtrType(*getContext()); Entry.Node = Dst; Args.push_back(Entry); Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY), Type::getVoidTy(*getContext()), getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY), TLI->getPointerTy(getDataLayout())), std::move(Args), 0) .setDiscardResult() .setTailCall(isTailCall); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; } SDValue SelectionDAG::getMemmove(SDValue Chain, SDLoc dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { assert(Align && "The SDAG layer expects explicit alignment and reserves 0"); // Check to see if we should lower the memmove to loads and stores first. // For cases within the target-specified limits, this is the best choice. ConstantSDNode *ConstantSize = dyn_cast(Size); if (ConstantSize) { // Memmove with size zero? Just return the original chain. if (ConstantSize->isNullValue()) return Chain; SDValue Result = getMemmoveLoadsAndStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Align, isVol, false, DstPtrInfo, SrcPtrInfo); if (Result.getNode()) return Result; } // Then check to see if we should lower the memmove with target-specific // code. If the target chooses to do this, this is the next best. if (TSI) { SDValue Result = TSI->EmitTargetCodeForMemmove( *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo, SrcPtrInfo); if (Result.getNode()) return Result; } checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace()); checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace()); // FIXME: If the memmove is volatile, lowering it to plain libc memmove may // not be safe. See memcpy above for more details. // Emit a library call. TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Ty = getDataLayout().getIntPtrType(*getContext()); Entry.Node = Dst; Args.push_back(Entry); Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), Type::getVoidTy(*getContext()), getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), TLI->getPointerTy(getDataLayout())), std::move(Args), 0) .setDiscardResult() .setTailCall(isTailCall); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; } SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, bool isTailCall, MachinePointerInfo DstPtrInfo) { assert(Align && "The SDAG layer expects explicit alignment and reserves 0"); // Check to see if we should lower the memset to stores first. // For cases within the target-specified limits, this is the best choice. ConstantSDNode *ConstantSize = dyn_cast(Size); if (ConstantSize) { // Memset with size zero? Just return the original chain. if (ConstantSize->isNullValue()) return Chain; SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Align, isVol, DstPtrInfo); if (Result.getNode()) return Result; } // Then check to see if we should lower the memset with target-specific // code. If the target chooses to do this, this is the next best. if (TSI) { SDValue Result = TSI->EmitTargetCodeForMemset( *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo); if (Result.getNode()) return Result; } checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace()); // Emit a library call. Type *IntPtrTy = getDataLayout().getIntPtrType(*getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; Entry.Ty = IntPtrTy; Args.push_back(Entry); Entry.Node = Src; Entry.Ty = Src.getValueType().getTypeForEVT(*getContext()); Args.push_back(Entry); Entry.Node = Size; Entry.Ty = IntPtrTy; Args.push_back(Entry); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), Type::getVoidTy(*getContext()), getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), TLI->getPointerTy(getDataLayout())), std::move(Args), 0) .setDiscardResult() .setTailCall(isTailCall); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; } SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTList, ArrayRef Ops, MachineMemOperand *MMO, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) { FoldingSetNodeID ID; ID.AddInteger(MemVT.getRawBits()); AddNodeIDNode(ID, Opcode, VTList, Ops); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void* IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } // Allocate the operands array for the node out of the BumpPtrAllocator, since // SDNode doesn't have access to it. This memory will be "leaked" when // the node is deallocated, but recovered when the allocator is released. // If the number of operands is less than 5 we use AtomicSDNode's internal // storage. unsigned NumOps = Ops.size(); SDUse *DynOps = NumOps > 4 ? OperandAllocator.Allocate(NumOps) : nullptr; SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, MemVT, Ops.data(), DynOps, NumOps, MMO, SuccessOrdering, FailureOrdering, SynchScope); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTList, ArrayRef Ops, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope) { return getAtomic(Opcode, dl, MemVT, VTList, Ops, MMO, Ordering, Ordering, SynchScope); } SDValue SelectionDAG::getAtomicCmpSwap( unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs, SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo, unsigned Alignment, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) { assert(Opcode == ISD::ATOMIC_CMP_SWAP || Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types"); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(MemVT); MachineFunction &MF = getMachineFunction(); // FIXME: Volatile isn't really correct; we should keep track of atomic // orderings in the memoperand. unsigned Flags = MachineMemOperand::MOVolatile; Flags |= MachineMemOperand::MOLoad; Flags |= MachineMemOperand::MOStore; MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment); return getAtomicCmpSwap(Opcode, dl, MemVT, VTs, Chain, Ptr, Cmp, Swp, MMO, SuccessOrdering, FailureOrdering, SynchScope); } SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs, SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp, MachineMemOperand *MMO, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) { assert(Opcode == ISD::ATOMIC_CMP_SWAP || Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types"); SDValue Ops[] = {Chain, Ptr, Cmp, Swp}; return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, SuccessOrdering, FailureOrdering, SynchScope); } SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, const Value* PtrVal, unsigned Alignment, AtomicOrdering Ordering, SynchronizationScope SynchScope) { if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(MemVT); MachineFunction &MF = getMachineFunction(); // An atomic store does not load. An atomic load does not store. // (An atomicrmw obviously both loads and stores.) // For now, atomics are considered to be volatile always, and they are // chained as such. // FIXME: Volatile isn't really correct; we should keep track of atomic // orderings in the memoperand. unsigned Flags = MachineMemOperand::MOVolatile; if (Opcode != ISD::ATOMIC_STORE) Flags |= MachineMemOperand::MOLoad; if (Opcode != ISD::ATOMIC_LOAD) Flags |= MachineMemOperand::MOStore; MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags, MemVT.getStoreSize(), Alignment); return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Val, MMO, Ordering, SynchScope); } SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope) { assert((Opcode == ISD::ATOMIC_LOAD_ADD || Opcode == ISD::ATOMIC_LOAD_SUB || Opcode == ISD::ATOMIC_LOAD_AND || Opcode == ISD::ATOMIC_LOAD_OR || Opcode == ISD::ATOMIC_LOAD_XOR || Opcode == ISD::ATOMIC_LOAD_NAND || Opcode == ISD::ATOMIC_LOAD_MIN || Opcode == ISD::ATOMIC_LOAD_MAX || Opcode == ISD::ATOMIC_LOAD_UMIN || Opcode == ISD::ATOMIC_LOAD_UMAX || Opcode == ISD::ATOMIC_SWAP || Opcode == ISD::ATOMIC_STORE) && "Invalid Atomic Op"); EVT VT = Val.getValueType(); SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other) : getVTList(VT, MVT::Other); SDValue Ops[] = {Chain, Ptr, Val}; return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, Ordering, SynchScope); } SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope) { assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op"); SDVTList VTs = getVTList(VT, MVT::Other); SDValue Ops[] = {Chain, Ptr}; return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, Ordering, SynchScope); } /// getMergeValues - Create a MERGE_VALUES node from the given operands. SDValue SelectionDAG::getMergeValues(ArrayRef Ops, SDLoc dl) { if (Ops.size() == 1) return Ops[0]; SmallVector VTs; VTs.reserve(Ops.size()); for (unsigned i = 0; i < Ops.size(); ++i) VTs.push_back(Ops[i].getValueType()); return getNode(ISD::MERGE_VALUES, dl, getVTList(VTs), Ops); } SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, ArrayRef Ops, EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align, bool Vol, bool ReadMem, bool WriteMem, unsigned Size) { if (Align == 0) // Ensure that codegen never sees alignment 0 Align = getEVTAlignment(MemVT); MachineFunction &MF = getMachineFunction(); unsigned Flags = 0; if (WriteMem) Flags |= MachineMemOperand::MOStore; if (ReadMem) Flags |= MachineMemOperand::MOLoad; if (Vol) Flags |= MachineMemOperand::MOVolatile; if (!Size) Size = MemVT.getStoreSize(); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, Size, Align); return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO); } SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, ArrayRef Ops, EVT MemVT, MachineMemOperand *MMO) { assert((Opcode == ISD::INTRINSIC_VOID || Opcode == ISD::INTRINSIC_W_CHAIN || Opcode == ISD::PREFETCH || Opcode == ISD::LIFETIME_START || Opcode == ISD::LIFETIME_END || (Opcode <= INT_MAX && (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) && "Opcode is not a memory-accessing opcode!"); // Memoize the node unless it returns a flag. MemIntrinsicSDNode *N; if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTList, Ops); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, Ops, MemVT, MMO); CSEMap.InsertNode(N, IP); } else { N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, Ops, MemVT, MMO); } InsertNode(N); return SDValue(N, 0); } /// InferPointerInfo - If the specified ptr/offset is a frame index, infer a /// MachinePointerInfo record from it. This is particularly useful because the /// code generator has many cases where it doesn't bother passing in a /// MachinePointerInfo to getLoad or getStore when it has "FI+Cst". static MachinePointerInfo InferPointerInfo(SelectionDAG &DAG, SDValue Ptr, int64_t Offset = 0) { // If this is FI+Offset, we can model it. if (const FrameIndexSDNode *FI = dyn_cast(Ptr)) return MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI->getIndex(), Offset); // If this is (FI+Offset1)+Offset2, we can model it. if (Ptr.getOpcode() != ISD::ADD || !isa(Ptr.getOperand(1)) || !isa(Ptr.getOperand(0))) return MachinePointerInfo(); int FI = cast(Ptr.getOperand(0))->getIndex(); return MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FI, Offset + cast(Ptr.getOperand(1))->getSExtValue()); } /// InferPointerInfo - If the specified ptr/offset is a frame index, infer a /// MachinePointerInfo record from it. This is particularly useful because the /// code generator has many cases where it doesn't bother passing in a /// MachinePointerInfo to getLoad or getStore when it has "FI+Cst". static MachinePointerInfo InferPointerInfo(SelectionDAG &DAG, SDValue Ptr, SDValue OffsetOp) { // If the 'Offset' value isn't a constant, we can't handle this. if (ConstantSDNode *OffsetNode = dyn_cast(OffsetOp)) return InferPointerInfo(DAG, Ptr, OffsetNode->getSExtValue()); if (OffsetOp.getOpcode() == ISD::UNDEF) return InferPointerInfo(DAG, Ptr); return MachinePointerInfo(); } SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue Offset, MachinePointerInfo PtrInfo, EVT MemVT, bool isVolatile, bool isNonTemporal, bool isInvariant, unsigned Alignment, const AAMDNodes &AAInfo, const MDNode *Ranges) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(VT); unsigned Flags = MachineMemOperand::MOLoad; if (isVolatile) Flags |= MachineMemOperand::MOVolatile; if (isNonTemporal) Flags |= MachineMemOperand::MONonTemporal; if (isInvariant) Flags |= MachineMemOperand::MOInvariant; // If we don't have a PtrInfo, infer the trivial frame index case to simplify // clients. if (PtrInfo.V.isNull()) PtrInfo = InferPointerInfo(*this, Ptr, Offset); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment, AAInfo, Ranges); return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO); } SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue Offset, EVT MemVT, MachineMemOperand *MMO) { if (VT == MemVT) { ExtType = ISD::NON_EXTLOAD; } else if (ExtType == ISD::NON_EXTLOAD) { assert(VT == MemVT && "Non-extending load from different memory type!"); } else { // Extending load. assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) && "Should only be an extending load, not truncating!"); assert(VT.isInteger() == MemVT.isInteger() && "Cannot convert from FP to Int or Int -> FP!"); assert(VT.isVector() == MemVT.isVector() && "Cannot use an ext load to convert to or from a vector!"); assert((!VT.isVector() || VT.getVectorNumElements() == MemVT.getVectorNumElements()) && "Cannot use an ext load to change the number of vector elements!"); } bool Indexed = AM != ISD::UNINDEXED; assert((Indexed || Offset.getOpcode() == ISD::UNDEF) && "Unindexed load with an offset!"); SDVTList VTs = Indexed ? getVTList(VT, Ptr.getValueType(), MVT::Other) : getVTList(VT, MVT::Other); SDValue Ops[] = { Chain, Ptr, Offset }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::LOAD, VTs, Ops); ID.AddInteger(MemVT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(ExtType, AM, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } SDNode *N = new (NodeAllocator) LoadSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, AM, ExtType, MemVT, MMO); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, bool isVolatile, bool isNonTemporal, bool isInvariant, unsigned Alignment, const AAMDNodes &AAInfo, const MDNode *Ranges) { SDValue Undef = getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef, PtrInfo, VT, isVolatile, isNonTemporal, isInvariant, Alignment, AAInfo, Ranges); } SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO) { SDValue Undef = getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef, VT, MMO); } SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, bool isVolatile, bool isNonTemporal, bool isInvariant, unsigned Alignment, const AAMDNodes &AAInfo) { SDValue Undef = getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, PtrInfo, MemVT, isVolatile, isNonTemporal, isInvariant, Alignment, AAInfo); } SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT, SDValue Chain, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO) { SDValue Undef = getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, MemVT, MMO); } SDValue SelectionDAG::getIndexedLoad(SDValue OrigLoad, SDLoc dl, SDValue Base, SDValue Offset, ISD::MemIndexedMode AM) { LoadSDNode *LD = cast(OrigLoad); assert(LD->getOffset().getOpcode() == ISD::UNDEF && "Load is already a indexed load!"); return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl, LD->getChain(), Base, Offset, LD->getPointerInfo(), LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(), false, LD->getAlignment()); } SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, bool isVolatile, bool isNonTemporal, unsigned Alignment, const AAMDNodes &AAInfo) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(Val.getValueType()); unsigned Flags = MachineMemOperand::MOStore; if (isVolatile) Flags |= MachineMemOperand::MOVolatile; if (isNonTemporal) Flags |= MachineMemOperand::MONonTemporal; if (PtrInfo.V.isNull()) PtrInfo = InferPointerInfo(*this, Ptr); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, Val.getValueType().getStoreSize(), Alignment, AAInfo); return getStore(Chain, dl, Val, Ptr, MMO); } SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachineMemOperand *MMO) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); EVT VT = Val.getValueType(); SDVTList VTs = getVTList(MVT::Other); SDValue Undef = getUNDEF(Ptr.getValueType()); SDValue Ops[] = { Chain, Val, Ptr, Undef }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::STORE, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, ISD::UNINDEXED, false, VT, MMO); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT,bool isVolatile, bool isNonTemporal, unsigned Alignment, const AAMDNodes &AAInfo) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(SVT); unsigned Flags = MachineMemOperand::MOStore; if (isVolatile) Flags |= MachineMemOperand::MOVolatile; if (isNonTemporal) Flags |= MachineMemOperand::MONonTemporal; if (PtrInfo.V.isNull()) PtrInfo = InferPointerInfo(*this, Ptr); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, SVT.getStoreSize(), Alignment, AAInfo); return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO); } SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, EVT SVT, MachineMemOperand *MMO) { EVT VT = Val.getValueType(); assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (VT == SVT) return getStore(Chain, dl, Val, Ptr, MMO); assert(SVT.getScalarType().bitsLT(VT.getScalarType()) && "Should only be a truncating store, not extending!"); assert(VT.isInteger() == SVT.isInteger() && "Can't do FP-INT conversion!"); assert(VT.isVector() == SVT.isVector() && "Cannot use trunc store to convert to or from a vector!"); assert((!VT.isVector() || VT.getVectorNumElements() == SVT.getVectorNumElements()) && "Cannot use trunc store to change the number of vector elements!"); SDVTList VTs = getVTList(MVT::Other); SDValue Undef = getUNDEF(Ptr.getValueType()); SDValue Ops[] = { Chain, Val, Ptr, Undef }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::STORE, VTs, Ops); ID.AddInteger(SVT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(true, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, ISD::UNINDEXED, true, SVT, MMO); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base, SDValue Offset, ISD::MemIndexedMode AM) { StoreSDNode *ST = cast(OrigStore); assert(ST->getOffset().getOpcode() == ISD::UNDEF && "Store is already a indexed store!"); SDVTList VTs = getVTList(Base.getValueType(), MVT::Other); SDValue Ops[] = { ST->getChain(), ST->getValue(), Base, Offset }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::STORE, VTs, Ops); ID.AddInteger(ST->getMemoryVT().getRawBits()); ID.AddInteger(ST->getRawSubclassData()); ID.AddInteger(ST->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) return SDValue(E, 0); SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, AM, ST->isTruncatingStore(), ST->getMemoryVT(), ST->getMemOperand()); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::LoadExtType ExtTy) { SDVTList VTs = getVTList(VT, MVT::Other); SDValue Ops[] = { Chain, Ptr, Mask, Src0 }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(ExtTy, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } SDNode *N = new (NodeAllocator) MaskedLoadSDNode(dl.getIROrder(), dl.getDebugLoc(), Ops, 4, VTs, ExtTy, MemVT, MMO); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, bool isTrunc) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); EVT VT = Val.getValueType(); SDVTList VTs = getVTList(MVT::Other); SDValue Ops[] = { Chain, Ptr, Mask, Val }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } SDNode *N = new (NodeAllocator) MaskedStoreSDNode(dl.getIROrder(), dl.getDebugLoc(), Ops, 4, VTs, isTrunc, MemVT, MMO); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, SDLoc dl, ArrayRef Ops, MachineMemOperand *MMO) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MGATHER, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(ISD::NON_EXTLOAD, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } MaskedGatherSDNode *N = new (NodeAllocator) MaskedGatherSDNode(dl.getIROrder(), dl.getDebugLoc(), Ops, VTs, VT, MMO); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, SDLoc dl, ArrayRef Ops, MachineMemOperand *MMO) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MSCATTER, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } SDNode *N = new (NodeAllocator) MaskedScatterSDNode(dl.getIROrder(), dl.getDebugLoc(), Ops, VTs, VT, MMO); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getVAArg(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue SV, unsigned Align) { SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, dl, MVT::i32) }; return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, ArrayRef Ops) { switch (Ops.size()) { case 0: return getNode(Opcode, DL, VT); case 1: return getNode(Opcode, DL, VT, static_cast(Ops[0])); case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; } // Copy from an SDUse array into an SDValue array for use with // the regular getNode logic. SmallVector NewOps(Ops.begin(), Ops.end()); return getNode(Opcode, DL, VT, NewOps); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, ArrayRef Ops, const SDNodeFlags *Flags) { unsigned NumOps = Ops.size(); switch (NumOps) { case 0: return getNode(Opcode, DL, VT); case 1: return getNode(Opcode, DL, VT, Ops[0]); case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; } switch (Opcode) { default: break; case ISD::CONCAT_VECTORS: { // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF. if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this)) return V; break; } case ISD::SELECT_CC: { assert(NumOps == 5 && "SELECT_CC takes 5 operands!"); assert(Ops[0].getValueType() == Ops[1].getValueType() && "LHS and RHS of condition must have same type!"); assert(Ops[2].getValueType() == Ops[3].getValueType() && "True and False arms of SelectCC must have same type!"); assert(Ops[2].getValueType() == VT && "select_cc node must be of same type as true and false value!"); break; } case ISD::BR_CC: { assert(NumOps == 5 && "BR_CC takes 5 operands!"); assert(Ops[2].getValueType() == Ops[3].getValueType() && "LHS/RHS of comparison should match types!"); break; } } // Memoize nodes. SDNode *N; SDVTList VTs = getVTList(VT); if (VT != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) return SDValue(E, 0); N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Ops); CSEMap.InsertNode(N, IP); } else { N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Ops); } InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, ArrayRef ResultTys, ArrayRef Ops) { return getNode(Opcode, DL, getVTList(ResultTys), Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, ArrayRef Ops) { if (VTList.NumVTs == 1) return getNode(Opcode, DL, VTList.VTs[0], Ops); #if 0 switch (Opcode) { // FIXME: figure out how to safely handle things like // int foo(int x) { return 1 << (x & 255); } // int bar() { return foo(256); } case ISD::SRA_PARTS: case ISD::SRL_PARTS: case ISD::SHL_PARTS: if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG && cast(N3.getOperand(1))->getVT() != MVT::i1) return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0)); else if (N3.getOpcode() == ISD::AND) if (ConstantSDNode *AndRHS = dyn_cast(N3.getOperand(1))) { // If the and is only masking out bits that cannot effect the shift, // eliminate the and. unsigned NumBits = VT.getScalarType().getSizeInBits()*2; if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1) return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0)); } break; } #endif // Memoize the node unless it returns a flag. SDNode *N; unsigned NumOps = Ops.size(); if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTList, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) return SDValue(E, 0); if (NumOps == 1) { N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0]); } else if (NumOps == 2) { N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1]); } else if (NumOps == 3) { N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1], Ops[2]); } else { N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops); } CSEMap.InsertNode(N, IP); } else { if (NumOps == 1) { N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0]); } else if (NumOps == 2) { N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1]); } else if (NumOps == 3) { N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1], Ops[2]); } else { N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops); } } InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList) { return getNode(Opcode, DL, VTList, None); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1) { SDValue Ops[] = { N1 }; return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1, SDValue N2) { SDValue Ops[] = { N1, N2 }; return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1, SDValue N2, SDValue N3) { SDValue Ops[] = { N1, N2, N3 }; return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1, SDValue N2, SDValue N3, SDValue N4) { SDValue Ops[] = { N1, N2, N3, N4 }; return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1, SDValue N2, SDValue N3, SDValue N4, SDValue N5) { SDValue Ops[] = { N1, N2, N3, N4, N5 }; return getNode(Opcode, DL, VTList, Ops); } SDVTList SelectionDAG::getVTList(EVT VT) { return makeVTList(SDNode::getValueTypeList(VT), 1); } SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) { FoldingSetNodeID ID; ID.AddInteger(2U); ID.AddInteger(VT1.getRawBits()); ID.AddInteger(VT2.getRawBits()); void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); if (!Result) { EVT *Array = Allocator.Allocate(2); Array[0] = VT1; Array[1] = VT2; Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 2); VTListMap.InsertNode(Result, IP); } return Result->getSDVTList(); } SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) { FoldingSetNodeID ID; ID.AddInteger(3U); ID.AddInteger(VT1.getRawBits()); ID.AddInteger(VT2.getRawBits()); ID.AddInteger(VT3.getRawBits()); void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); if (!Result) { EVT *Array = Allocator.Allocate(3); Array[0] = VT1; Array[1] = VT2; Array[2] = VT3; Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 3); VTListMap.InsertNode(Result, IP); } return Result->getSDVTList(); } SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) { FoldingSetNodeID ID; ID.AddInteger(4U); ID.AddInteger(VT1.getRawBits()); ID.AddInteger(VT2.getRawBits()); ID.AddInteger(VT3.getRawBits()); ID.AddInteger(VT4.getRawBits()); void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); if (!Result) { EVT *Array = Allocator.Allocate(4); Array[0] = VT1; Array[1] = VT2; Array[2] = VT3; Array[3] = VT4; Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 4); VTListMap.InsertNode(Result, IP); } return Result->getSDVTList(); } SDVTList SelectionDAG::getVTList(ArrayRef VTs) { unsigned NumVTs = VTs.size(); FoldingSetNodeID ID; ID.AddInteger(NumVTs); for (unsigned index = 0; index < NumVTs; index++) { ID.AddInteger(VTs[index].getRawBits()); } void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); if (!Result) { EVT *Array = Allocator.Allocate(NumVTs); std::copy(VTs.begin(), VTs.end(), Array); Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, NumVTs); VTListMap.InsertNode(Result, IP); } return Result->getSDVTList(); } /// UpdateNodeOperands - *Mutate* the specified node in-place to have the /// specified operands. If the resultant node already exists in the DAG, /// this does not modify the specified node, instead it returns the node that /// already exists. If the resultant node does not exist in the DAG, the /// input node is returned. As a degenerate case, if you specify the same /// input operands as the node already has, the input node is returned. SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) { assert(N->getNumOperands() == 1 && "Update with wrong number of operands"); // Check to see if there is no change. if (Op == N->getOperand(0)) return N; // See if the modified node already exists. void *InsertPos = nullptr; if (SDNode *Existing = FindModifiedNodeSlot(N, Op, InsertPos)) return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) if (!RemoveNodeFromCSEMaps(N)) InsertPos = nullptr; // Now we update the operands. N->OperandList[0].set(Op); // If this gets put into a CSE map, add it. if (InsertPos) CSEMap.InsertNode(N, InsertPos); return N; } SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) { assert(N->getNumOperands() == 2 && "Update with wrong number of operands"); // Check to see if there is no change. if (Op1 == N->getOperand(0) && Op2 == N->getOperand(1)) return N; // No operands changed, just return the input node. // See if the modified node already exists. void *InsertPos = nullptr; if (SDNode *Existing = FindModifiedNodeSlot(N, Op1, Op2, InsertPos)) return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) if (!RemoveNodeFromCSEMaps(N)) InsertPos = nullptr; // Now we update the operands. if (N->OperandList[0] != Op1) N->OperandList[0].set(Op1); if (N->OperandList[1] != Op2) N->OperandList[1].set(Op2); // If this gets put into a CSE map, add it. if (InsertPos) CSEMap.InsertNode(N, InsertPos); return N; } SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3) { SDValue Ops[] = { Op1, Op2, Op3 }; return UpdateNodeOperands(N, Ops); } SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4) { SDValue Ops[] = { Op1, Op2, Op3, Op4 }; return UpdateNodeOperands(N, Ops); } SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4, SDValue Op5) { SDValue Ops[] = { Op1, Op2, Op3, Op4, Op5 }; return UpdateNodeOperands(N, Ops); } SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, ArrayRef Ops) { unsigned NumOps = Ops.size(); assert(N->getNumOperands() == NumOps && "Update with wrong number of operands"); // If no operands changed just return the input node. if (std::equal(Ops.begin(), Ops.end(), N->op_begin())) return N; // See if the modified node already exists. void *InsertPos = nullptr; if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, InsertPos)) return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) if (!RemoveNodeFromCSEMaps(N)) InsertPos = nullptr; // Now we update the operands. for (unsigned i = 0; i != NumOps; ++i) if (N->OperandList[i] != Ops[i]) N->OperandList[i].set(Ops[i]); // If this gets put into a CSE map, add it. if (InsertPos) CSEMap.InsertNode(N, InsertPos); return N; } /// DropOperands - Release the operands and set this node to have /// zero operands. void SDNode::DropOperands() { // Unlike the code in MorphNodeTo that does this, we don't need to // watch for dead nodes here. for (op_iterator I = op_begin(), E = op_end(); I != E; ) { SDUse &Use = *I++; Use.set(SDValue()); } } /// SelectNodeTo - These are wrappers around MorphNodeTo that accept a /// machine opcode. /// SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT) { SDVTList VTs = getVTList(VT); return SelectNodeTo(N, MachineOpc, VTs, None); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, SDValue Op1) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1, Op2 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1, Op2, Op3 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, ArrayRef Ops) { SDVTList VTs = getVTList(VT); return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2); return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2) { SDVTList VTs = getVTList(VT1, VT2); return SelectNodeTo(N, MachineOpc, VTs, None); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, EVT VT3, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2, VT3); return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, EVT VT3, EVT VT4, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2, VT3, VT4); return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, SDValue Op1) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1, Op2 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1, Op2, Op3 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, EVT VT3, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT1, VT2, VT3); SDValue Ops[] = { Op1, Op2, Op3 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, SDVTList VTs,ArrayRef Ops) { N = MorphNodeTo(N, ~MachineOpc, VTs, Ops); // Reset the NodeID to -1. N->setNodeId(-1); return N; } /// UpdadeSDLocOnMergedSDNode - If the opt level is -O0 then it throws away /// the line number information on the merged node since it is not possible to /// preserve the information that operation is associated with multiple lines. /// This will make the debugger working better at -O0, were there is a higher /// probability having other instructions associated with that line. /// /// For IROrder, we keep the smaller of the two SDNode *SelectionDAG::UpdadeSDLocOnMergedSDNode(SDNode *N, SDLoc OLoc) { DebugLoc NLoc = N->getDebugLoc(); if (NLoc && OptLevel == CodeGenOpt::None && OLoc.getDebugLoc() != NLoc) { N->setDebugLoc(DebugLoc()); } unsigned Order = std::min(N->getIROrder(), OLoc.getIROrder()); N->setIROrder(Order); return N; } /// MorphNodeTo - This *mutates* the specified node to have the specified /// return type, opcode, and operands. /// /// Note that MorphNodeTo returns the resultant node. If there is already a /// node of the specified opcode and operands, it returns that node instead of /// the current one. Note that the SDLoc need not be the same. /// /// Using MorphNodeTo is faster than creating a new node and swapping it in /// with ReplaceAllUsesWith both because it often avoids allocating a new /// node, and because it doesn't require CSE recalculation for any of /// the node's users. /// /// However, note that MorphNodeTo recursively deletes dead nodes from the DAG. /// As a consequence it isn't appropriate to use from within the DAG combiner or /// the legalizer which maintain worklists that would need to be updated when /// deleting things. SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef Ops) { unsigned NumOps = Ops.size(); // If an identical node already exists, use it. void *IP = nullptr; if (VTs.VTs[VTs.NumVTs-1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, VTs, Ops); if (SDNode *ON = FindNodeOrInsertPos(ID, N->getDebugLoc(), IP)) return UpdadeSDLocOnMergedSDNode(ON, SDLoc(N)); } if (!RemoveNodeFromCSEMaps(N)) IP = nullptr; // Start the morphing. N->NodeType = Opc; N->ValueList = VTs.VTs; N->NumValues = VTs.NumVTs; // Clear the operands list, updating used nodes to remove this from their // use list. Keep track of any operands that become dead as a result. SmallPtrSet DeadNodeSet; for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) { SDUse &Use = *I++; SDNode *Used = Use.getNode(); Use.set(SDValue()); if (Used->use_empty()) DeadNodeSet.insert(Used); } if (MachineSDNode *MN = dyn_cast(N)) { // Initialize the memory references information. MN->setMemRefs(nullptr, nullptr); // If NumOps is larger than the # of operands we can have in a // MachineSDNode, reallocate the operand list. if (NumOps > MN->NumOperands || !MN->OperandsNeedDelete) { if (MN->OperandsNeedDelete) delete[] MN->OperandList; if (NumOps > array_lengthof(MN->LocalOperands)) // We're creating a final node that will live unmorphed for the // remainder of the current SelectionDAG iteration, so we can allocate // the operands directly out of a pool with no recycling metadata. MN->InitOperands(OperandAllocator.Allocate(NumOps), Ops.data(), NumOps); else MN->InitOperands(MN->LocalOperands, Ops.data(), NumOps); MN->OperandsNeedDelete = false; } else MN->InitOperands(MN->OperandList, Ops.data(), NumOps); } else { // If NumOps is larger than the # of operands we currently have, reallocate // the operand list. if (NumOps > N->NumOperands) { if (N->OperandsNeedDelete) delete[] N->OperandList; N->InitOperands(new SDUse[NumOps], Ops.data(), NumOps); N->OperandsNeedDelete = true; } else N->InitOperands(N->OperandList, Ops.data(), NumOps); } // Delete any nodes that are still dead after adding the uses for the // new operands. if (!DeadNodeSet.empty()) { SmallVector DeadNodes; for (SDNode *N : DeadNodeSet) if (N->use_empty()) DeadNodes.push_back(N); RemoveDeadNodes(DeadNodes); } if (IP) CSEMap.InsertNode(N, IP); // Memoize the new node. return N; } /// getMachineNode - These are used for target selectors to create a new node /// with specified return type(s), MachineInstr opcode, and operands. /// /// Note that getMachineNode returns the resultant node. If there is already a /// node of the specified opcode and operands, it returns that node instead of /// the current one. MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT) { SDVTList VTs = getVTList(VT); return getMachineNode(Opcode, dl, VTs, None); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT, SDValue Op1) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1, Op2 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1, Op2, Op3 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT, ArrayRef Ops) { SDVTList VTs = getVTList(VT); return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2) { SDVTList VTs = getVTList(VT1, VT2); return getMachineNode(Opcode, dl, VTs, None); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, SDValue Op1) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1, Op2 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1, Op2, Op3 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2); return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, EVT VT3, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT1, VT2, VT3); SDValue Ops[] = { Op1, Op2 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, EVT VT3, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT1, VT2, VT3); SDValue Ops[] = { Op1, Op2, Op3 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, EVT VT3, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2, VT3); return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2, EVT VT3, EVT VT4, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2, VT3, VT4); return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, ArrayRef ResultTys, ArrayRef Ops) { SDVTList VTs = getVTList(ResultTys); return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc DL, SDVTList VTs, ArrayRef OpsArray) { bool DoCSE = VTs.VTs[VTs.NumVTs-1] != MVT::Glue; MachineSDNode *N; void *IP = nullptr; const SDValue *Ops = OpsArray.data(); unsigned NumOps = OpsArray.size(); if (DoCSE) { FoldingSetNodeID ID; AddNodeIDNode(ID, ~Opcode, VTs, OpsArray); IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) { return cast(UpdadeSDLocOnMergedSDNode(E, DL)); } } // Allocate a new MachineSDNode. N = new (NodeAllocator) MachineSDNode(~Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); // Initialize the operands list. if (NumOps > array_lengthof(N->LocalOperands)) // We're creating a final node that will live unmorphed for the // remainder of the current SelectionDAG iteration, so we can allocate // the operands directly out of a pool with no recycling metadata. N->InitOperands(OperandAllocator.Allocate(NumOps), Ops, NumOps); else N->InitOperands(N->LocalOperands, Ops, NumOps); N->OperandsNeedDelete = false; if (DoCSE) CSEMap.InsertNode(N, IP); InsertNode(N); return N; } /// getTargetExtractSubreg - A convenience function for creating /// TargetOpcode::EXTRACT_SUBREG nodes. SDValue SelectionDAG::getTargetExtractSubreg(int SRIdx, SDLoc DL, EVT VT, SDValue Operand) { SDValue SRIdxVal = getTargetConstant(SRIdx, DL, MVT::i32); SDNode *Subreg = getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, Operand, SRIdxVal); return SDValue(Subreg, 0); } /// getTargetInsertSubreg - A convenience function for creating /// TargetOpcode::INSERT_SUBREG nodes. SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, SDLoc DL, EVT VT, SDValue Operand, SDValue Subreg) { SDValue SRIdxVal = getTargetConstant(SRIdx, DL, MVT::i32); SDNode *Result = getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT, Operand, Subreg, SRIdxVal); return SDValue(Result, 0); } /// getNodeIfExists - Get the specified node if it's already available, or /// else return NULL. SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef Ops, const SDNodeFlags *Flags) { if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTList, Ops); - AddNodeIDFlags(ID, Opcode, Flags); void *IP = nullptr; - if (SDNode *E = FindNodeOrInsertPos(ID, DebugLoc(), IP)) + if (SDNode *E = FindNodeOrInsertPos(ID, DebugLoc(), IP)) { + if (Flags) + E->intersectFlagsWith(Flags); return E; + } } return nullptr; } /// getDbgValue - Creates a SDDbgValue node. /// /// SDNode SDDbgValue *SelectionDAG::getDbgValue(MDNode *Var, MDNode *Expr, SDNode *N, unsigned R, bool IsIndirect, uint64_t Off, DebugLoc DL, unsigned O) { assert(cast(Var)->isValidLocationForIntrinsic(DL) && "Expected inlined-at fields to agree"); return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, N, R, IsIndirect, Off, DL, O); } /// Constant SDDbgValue *SelectionDAG::getConstantDbgValue(MDNode *Var, MDNode *Expr, const Value *C, uint64_t Off, DebugLoc DL, unsigned O) { assert(cast(Var)->isValidLocationForIntrinsic(DL) && "Expected inlined-at fields to agree"); return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, C, Off, DL, O); } /// FrameIndex SDDbgValue *SelectionDAG::getFrameIndexDbgValue(MDNode *Var, MDNode *Expr, unsigned FI, uint64_t Off, DebugLoc DL, unsigned O) { assert(cast(Var)->isValidLocationForIntrinsic(DL) && "Expected inlined-at fields to agree"); return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, FI, Off, DL, O); } namespace { /// RAUWUpdateListener - Helper for ReplaceAllUsesWith - When the node /// pointed to by a use iterator is deleted, increment the use iterator /// so that it doesn't dangle. /// class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener { SDNode::use_iterator &UI; SDNode::use_iterator &UE; void NodeDeleted(SDNode *N, SDNode *E) override { // Increment the iterator as needed. while (UI != UE && N == *UI) ++UI; } public: RAUWUpdateListener(SelectionDAG &d, SDNode::use_iterator &ui, SDNode::use_iterator &ue) : SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {} }; } /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead. /// This can cause recursive merging of nodes in the DAG. /// /// This version assumes From has a single result value. /// void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) { SDNode *From = FromN.getNode(); assert(From->getNumValues() == 1 && FromN.getResNo() == 0 && "Cannot replace with this method!"); assert(From != To.getNode() && "Cannot replace uses of with self"); // Iterate over all the existing uses of From. New uses will be added // to the beginning of the use list, which we avoid visiting. // This specifically avoids visiting uses of From that arise while the // replacement is happening, because any such uses would be the result // of CSE: If an existing node looks like From after one of its operands // is replaced by To, we don't want to replace of all its users with To // too. See PR3018 for more info. SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; // This node is about to morph, remove its old self from the CSE maps. RemoveNodeFromCSEMaps(User); // A user can appear in a use list multiple times, and when this // happens the uses are usually next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. do { SDUse &Use = UI.getUse(); ++UI; Use.set(To); } while (UI != UE && *UI == User); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. if (FromN == getRoot()) setRoot(To); } /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead. /// This can cause recursive merging of nodes in the DAG. /// /// This version assumes that for each value of From, there is a /// corresponding value in To in the same position with the same type. /// void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To) { #ifndef NDEBUG for (unsigned i = 0, e = From->getNumValues(); i != e; ++i) assert((!From->hasAnyUseOfValue(i) || From->getValueType(i) == To->getValueType(i)) && "Cannot use this version of ReplaceAllUsesWith!"); #endif // Handle the trivial case. if (From == To) return; // Iterate over just the existing users of From. See the comments in // the ReplaceAllUsesWith above. SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; // This node is about to morph, remove its old self from the CSE maps. RemoveNodeFromCSEMaps(User); // A user can appear in a use list multiple times, and when this // happens the uses are usually next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. do { SDUse &Use = UI.getUse(); ++UI; Use.setNode(To); } while (UI != UE && *UI == User); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. if (From == getRoot().getNode()) setRoot(SDValue(To, getRoot().getResNo())); } /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead. /// This can cause recursive merging of nodes in the DAG. /// /// This version can replace From with any result values. To must match the /// number and types of values returned by From. void SelectionDAG::ReplaceAllUsesWith(SDNode *From, const SDValue *To) { if (From->getNumValues() == 1) // Handle the simple case efficiently. return ReplaceAllUsesWith(SDValue(From, 0), To[0]); // Iterate over just the existing users of From. See the comments in // the ReplaceAllUsesWith above. SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; // This node is about to morph, remove its old self from the CSE maps. RemoveNodeFromCSEMaps(User); // A user can appear in a use list multiple times, and when this // happens the uses are usually next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. do { SDUse &Use = UI.getUse(); const SDValue &ToOp = To[Use.getResNo()]; ++UI; Use.set(ToOp); } while (UI != UE && *UI == User); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. if (From == getRoot().getNode()) setRoot(SDValue(To[getRoot().getResNo()])); } /// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving /// uses of other values produced by From.getNode() alone. The Deleted /// vector is handled the same way as for ReplaceAllUsesWith. void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){ // Handle the really simple, really trivial case efficiently. if (From == To) return; // Handle the simple, trivial, case efficiently. if (From.getNode()->getNumValues() == 1) { ReplaceAllUsesWith(From, To); return; } // Iterate over just the existing users of From. See the comments in // the ReplaceAllUsesWith above. SDNode::use_iterator UI = From.getNode()->use_begin(), UE = From.getNode()->use_end(); RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; bool UserRemovedFromCSEMaps = false; // A user can appear in a use list multiple times, and when this // happens the uses are usually next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. do { SDUse &Use = UI.getUse(); // Skip uses of different values from the same node. if (Use.getResNo() != From.getResNo()) { ++UI; continue; } // If this node hasn't been modified yet, it's still in the CSE maps, // so remove its old self from the CSE maps. if (!UserRemovedFromCSEMaps) { RemoveNodeFromCSEMaps(User); UserRemovedFromCSEMaps = true; } ++UI; Use.set(To); } while (UI != UE && *UI == User); // We are iterating over all uses of the From node, so if a use // doesn't use the specific value, no changes are made. if (!UserRemovedFromCSEMaps) continue; // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. if (From == getRoot()) setRoot(To); } namespace { /// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith /// to record information about a use. struct UseMemo { SDNode *User; unsigned Index; SDUse *Use; }; /// operator< - Sort Memos by User. bool operator<(const UseMemo &L, const UseMemo &R) { return (intptr_t)L.User < (intptr_t)R.User; } } /// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving /// uses of other values produced by From.getNode() alone. The same value /// may appear in both the From and To list. The Deleted vector is /// handled the same way as for ReplaceAllUsesWith. void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From, const SDValue *To, unsigned Num){ // Handle the simple, trivial case efficiently. if (Num == 1) return ReplaceAllUsesOfValueWith(*From, *To); // Read up all the uses and make records of them. This helps // processing new uses that are introduced during the // replacement process. SmallVector Uses; for (unsigned i = 0; i != Num; ++i) { unsigned FromResNo = From[i].getResNo(); SDNode *FromNode = From[i].getNode(); for (SDNode::use_iterator UI = FromNode->use_begin(), E = FromNode->use_end(); UI != E; ++UI) { SDUse &Use = UI.getUse(); if (Use.getResNo() == FromResNo) { UseMemo Memo = { *UI, i, &Use }; Uses.push_back(Memo); } } } // Sort the uses, so that all the uses from a given User are together. std::sort(Uses.begin(), Uses.end()); for (unsigned UseIndex = 0, UseIndexEnd = Uses.size(); UseIndex != UseIndexEnd; ) { // We know that this user uses some value of From. If it is the right // value, update it. SDNode *User = Uses[UseIndex].User; // This node is about to morph, remove its old self from the CSE maps. RemoveNodeFromCSEMaps(User); // The Uses array is sorted, so all the uses for a given User // are next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. do { unsigned i = Uses[UseIndex].Index; SDUse &Use = *Uses[UseIndex].Use; ++UseIndex; Use.set(To[i]); } while (UseIndex != UseIndexEnd && Uses[UseIndex].User == User); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } } /// AssignTopologicalOrder - Assign a unique node id for each node in the DAG /// based on their topological order. It returns the maximum id and a vector /// of the SDNodes* in assigned order by reference. unsigned SelectionDAG::AssignTopologicalOrder() { unsigned DAGSize = 0; // SortedPos tracks the progress of the algorithm. Nodes before it are // sorted, nodes after it are unsorted. When the algorithm completes // it is at the end of the list. allnodes_iterator SortedPos = allnodes_begin(); // Visit all the nodes. Move nodes with no operands to the front of // the list immediately. Annotate nodes that do have operands with their // operand count. Before we do this, the Node Id fields of the nodes // may contain arbitrary values. After, the Node Id fields for nodes // before SortedPos will contain the topological sort index, and the // Node Id fields for nodes At SortedPos and after will contain the // count of outstanding operands. for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) { SDNode *N = &*I++; checkForCycles(N, this); unsigned Degree = N->getNumOperands(); if (Degree == 0) { // A node with no uses, add it to the result array immediately. N->setNodeId(DAGSize++); allnodes_iterator Q(N); if (Q != SortedPos) SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q)); assert(SortedPos != AllNodes.end() && "Overran node list"); ++SortedPos; } else { // Temporarily use the Node Id as scratch space for the degree count. N->setNodeId(Degree); } } // Visit all the nodes. As we iterate, move nodes into sorted order, // such that by the time the end is reached all nodes will be sorted. for (SDNode &Node : allnodes()) { SDNode *N = &Node; checkForCycles(N, this); // N is in sorted position, so all its uses have one less operand // that needs to be sorted. for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { SDNode *P = *UI; unsigned Degree = P->getNodeId(); assert(Degree != 0 && "Invalid node degree"); --Degree; if (Degree == 0) { // All of P's operands are sorted, so P may sorted now. P->setNodeId(DAGSize++); if (P != SortedPos) SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(P)); assert(SortedPos != AllNodes.end() && "Overran node list"); ++SortedPos; } else { // Update P's outstanding operand count. P->setNodeId(Degree); } } if (&Node == SortedPos) { #ifndef NDEBUG allnodes_iterator I(N); SDNode *S = &*++I; dbgs() << "Overran sorted position:\n"; S->dumprFull(this); dbgs() << "\n"; dbgs() << "Checking if this is due to cycles\n"; checkForCycles(this, true); #endif llvm_unreachable(nullptr); } } assert(SortedPos == AllNodes.end() && "Topological sort incomplete!"); assert(AllNodes.front().getOpcode() == ISD::EntryToken && "First node in topological sort is not the entry token!"); assert(AllNodes.front().getNodeId() == 0 && "First node in topological sort has non-zero id!"); assert(AllNodes.front().getNumOperands() == 0 && "First node in topological sort has operands!"); assert(AllNodes.back().getNodeId() == (int)DAGSize-1 && "Last node in topologic sort has unexpected id!"); assert(AllNodes.back().use_empty() && "Last node in topologic sort has users!"); assert(DAGSize == allnodes_size() && "Node count mismatch!"); return DAGSize; } /// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the /// value is produced by SD. void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter) { if (SD) { assert(DbgInfo->getSDDbgValues(SD).empty() || SD->getHasDebugValue()); SD->setHasDebugValue(true); } DbgInfo->add(DB, SD, isParameter); } /// TransferDbgValues - Transfer SDDbgValues. void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) { if (From == To || !From.getNode()->getHasDebugValue()) return; SDNode *FromNode = From.getNode(); SDNode *ToNode = To.getNode(); ArrayRef DVs = GetDbgValues(FromNode); SmallVector ClonedDVs; for (ArrayRef::iterator I = DVs.begin(), E = DVs.end(); I != E; ++I) { SDDbgValue *Dbg = *I; if (Dbg->getKind() == SDDbgValue::SDNODE) { SDDbgValue *Clone = getDbgValue(Dbg->getVariable(), Dbg->getExpression(), ToNode, To.getResNo(), Dbg->isIndirect(), Dbg->getOffset(), Dbg->getDebugLoc(), Dbg->getOrder()); ClonedDVs.push_back(Clone); } } for (SmallVectorImpl::iterator I = ClonedDVs.begin(), E = ClonedDVs.end(); I != E; ++I) AddDbgValue(*I, ToNode, false); } //===----------------------------------------------------------------------===// // SDNode Class //===----------------------------------------------------------------------===// bool llvm::isNullConstant(SDValue V) { ConstantSDNode *Const = dyn_cast(V); return Const != nullptr && Const->isNullValue(); } bool llvm::isNullFPConstant(SDValue V) { ConstantFPSDNode *Const = dyn_cast(V); return Const != nullptr && Const->isZero() && !Const->isNegative(); } bool llvm::isAllOnesConstant(SDValue V) { ConstantSDNode *Const = dyn_cast(V); return Const != nullptr && Const->isAllOnesValue(); } bool llvm::isOneConstant(SDValue V) { ConstantSDNode *Const = dyn_cast(V); return Const != nullptr && Const->isOne(); } HandleSDNode::~HandleSDNode() { DropOperands(); } GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order, DebugLoc DL, const GlobalValue *GA, EVT VT, int64_t o, unsigned char TF) : SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) { TheGlobal = GA; } AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, DebugLoc dl, EVT VT, SDValue X, unsigned SrcAS, unsigned DestAS) : UnarySDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT), X), SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {} MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, EVT memvt, MachineMemOperand *mmo) : SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) { SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant()); assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!"); assert(isNonTemporal() == MMO->isNonTemporal() && "Non-temporal encoding error!"); // We check here that the size of the memory operand fits within the size of // the MMO. This is because the MMO might indicate only a possible address // range instead of specifying the affected memory addresses precisely. assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!"); } MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, ArrayRef Ops, EVT memvt, MachineMemOperand *mmo) : SDNode(Opc, Order, dl, VTs, Ops), MemoryVT(memvt), MMO(mmo) { SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant()); assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!"); assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!"); } /// Profile - Gather unique data for the node. /// void SDNode::Profile(FoldingSetNodeID &ID) const { AddNodeIDNode(ID, this); } namespace { struct EVTArray { std::vector VTs; EVTArray() { VTs.reserve(MVT::LAST_VALUETYPE); for (unsigned i = 0; i < MVT::LAST_VALUETYPE; ++i) VTs.push_back(MVT((MVT::SimpleValueType)i)); } }; } static ManagedStatic > EVTs; static ManagedStatic SimpleVTArray; static ManagedStatic > VTMutex; /// getValueTypeList - Return a pointer to the specified value type. /// const EVT *SDNode::getValueTypeList(EVT VT) { if (VT.isExtended()) { sys::SmartScopedLock Lock(*VTMutex); return &(*EVTs->insert(VT).first); } else { assert(VT.getSimpleVT() < MVT::LAST_VALUETYPE && "Value type out of range!"); return &SimpleVTArray->VTs[VT.getSimpleVT().SimpleTy]; } } /// hasNUsesOfValue - Return true if there are exactly NUSES uses of the /// indicated value. This method ignores uses of other values defined by this /// operation. bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const { assert(Value < getNumValues() && "Bad value!"); // TODO: Only iterate over uses of a given value of the node for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) { if (UI.getUse().getResNo() == Value) { if (NUses == 0) return false; --NUses; } } // Found exactly the right number of uses? return NUses == 0; } /// hasAnyUseOfValue - Return true if there are any use of the indicated /// value. This method ignores uses of other values defined by this operation. bool SDNode::hasAnyUseOfValue(unsigned Value) const { assert(Value < getNumValues() && "Bad value!"); for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) if (UI.getUse().getResNo() == Value) return true; return false; } /// isOnlyUserOf - Return true if this node is the only use of N. /// bool SDNode::isOnlyUserOf(const SDNode *N) const { bool Seen = false; for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { SDNode *User = *I; if (User == this) Seen = true; else return false; } return Seen; } /// isOperand - Return true if this node is an operand of N. /// bool SDValue::isOperandOf(const SDNode *N) const { for (const SDValue &Op : N->op_values()) if (*this == Op) return true; return false; } bool SDNode::isOperandOf(const SDNode *N) const { for (const SDValue &Op : N->op_values()) if (this == Op.getNode()) return true; return false; } /// reachesChainWithoutSideEffects - Return true if this operand (which must /// be a chain) reaches the specified operand without crossing any /// side-effecting instructions on any chain path. In practice, this looks /// through token factors and non-volatile loads. In order to remain efficient, /// this only looks a couple of nodes in, it does not do an exhaustive search. bool SDValue::reachesChainWithoutSideEffects(SDValue Dest, unsigned Depth) const { if (*this == Dest) return true; // Don't search too deeply, we just want to be able to see through // TokenFactor's etc. if (Depth == 0) return false; // If this is a token factor, all inputs to the TF happen in parallel. If any // of the operands of the TF does not reach dest, then we cannot do the xform. if (getOpcode() == ISD::TokenFactor) { for (unsigned i = 0, e = getNumOperands(); i != e; ++i) if (!getOperand(i).reachesChainWithoutSideEffects(Dest, Depth-1)) return false; return true; } // Loads don't have side effects, look through them. if (LoadSDNode *Ld = dyn_cast(*this)) { if (!Ld->isVolatile()) return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1); } return false; } /// hasPredecessor - Return true if N is a predecessor of this node. /// N is either an operand of this node, or can be reached by recursively /// traversing up the operands. /// NOTE: This is an expensive method. Use it carefully. bool SDNode::hasPredecessor(const SDNode *N) const { SmallPtrSet Visited; SmallVector Worklist; return hasPredecessorHelper(N, Visited, Worklist); } bool SDNode::hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl &Visited, SmallVectorImpl &Worklist) const { if (Visited.empty()) { Worklist.push_back(this); } else { // Take a look in the visited set. If we've already encountered this node // we needn't search further. if (Visited.count(N)) return true; } // Haven't visited N yet. Continue the search. while (!Worklist.empty()) { const SDNode *M = Worklist.pop_back_val(); for (const SDValue &OpV : M->op_values()) { SDNode *Op = OpV.getNode(); if (Visited.insert(Op).second) Worklist.push_back(Op); if (Op == N) return true; } } return false; } uint64_t SDNode::getConstantOperandVal(unsigned Num) const { assert(Num < NumOperands && "Invalid child # of SDNode!"); return cast(OperandList[Num])->getZExtValue(); } const SDNodeFlags *SDNode::getFlags() const { if (auto *FlagsNode = dyn_cast(this)) return &FlagsNode->Flags; return nullptr; +} + +void SDNode::intersectFlagsWith(const SDNodeFlags *Flags) { + if (auto *FlagsNode = dyn_cast(this)) + FlagsNode->Flags.intersectWith(Flags); } SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { assert(N->getNumValues() == 1 && "Can't unroll a vector with multiple results!"); EVT VT = N->getValueType(0); unsigned NE = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); SDLoc dl(N); SmallVector Scalars; SmallVector Operands(N->getNumOperands()); // If ResNE is 0, fully unroll the vector op. if (ResNE == 0) ResNE = NE; else if (NE > ResNE) NE = ResNE; unsigned i; for (i= 0; i != NE; ++i) { for (unsigned j = 0, e = N->getNumOperands(); j != e; ++j) { SDValue Operand = N->getOperand(j); EVT OperandVT = Operand.getValueType(); if (OperandVT.isVector()) { // A vector operand; extract a single element. EVT OperandEltVT = OperandVT.getVectorElementType(); Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT, Operand, getConstant(i, dl, TLI->getVectorIdxTy(getDataLayout()))); } else { // A scalar operand; just use it as is. Operands[j] = Operand; } } switch (N->getOpcode()) { default: { Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands, N->getFlags())); break; } case ISD::VSELECT: Scalars.push_back(getNode(ISD::SELECT, dl, EltVT, Operands)); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::ROTL: case ISD::ROTR: Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0], getShiftAmountOperand(Operands[0].getValueType(), Operands[1]))); break; case ISD::SIGN_EXTEND_INREG: case ISD::FP_ROUND_INREG: { EVT ExtVT = cast(Operands[1])->getVT().getVectorElementType(); Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0], getValueType(ExtVT))); } } } for (; i < ResNE; ++i) Scalars.push_back(getUNDEF(EltVT)); return getNode(ISD::BUILD_VECTOR, dl, EVT::getVectorVT(*getContext(), EltVT, ResNE), Scalars); } /// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a /// location that is 'Dist' units away from the location that the 'Base' load /// is loading from. bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const { if (LD->getChain() != Base->getChain()) return false; EVT VT = LD->getValueType(0); if (VT.getSizeInBits() / 8 != Bytes) return false; SDValue Loc = LD->getOperand(1); SDValue BaseLoc = Base->getOperand(1); if (Loc.getOpcode() == ISD::FrameIndex) { if (BaseLoc.getOpcode() != ISD::FrameIndex) return false; const MachineFrameInfo *MFI = getMachineFunction().getFrameInfo(); int FI = cast(Loc)->getIndex(); int BFI = cast(BaseLoc)->getIndex(); int FS = MFI->getObjectSize(FI); int BFS = MFI->getObjectSize(BFI); if (FS != BFS || FS != (int)Bytes) return false; return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes); } // Handle X + C. if (isBaseWithConstantOffset(Loc)) { int64_t LocOffset = cast(Loc.getOperand(1))->getSExtValue(); if (Loc.getOperand(0) == BaseLoc) { // If the base location is a simple address with no offset itself, then // the second load's first add operand should be the base address. if (LocOffset == Dist * (int)Bytes) return true; } else if (isBaseWithConstantOffset(BaseLoc)) { // The base location itself has an offset, so subtract that value from the // second load's offset before comparing to distance * size. int64_t BOffset = cast(BaseLoc.getOperand(1))->getSExtValue(); if (Loc.getOperand(0) == BaseLoc.getOperand(0)) { if ((LocOffset - BOffset) == Dist * (int)Bytes) return true; } } } const GlobalValue *GV1 = nullptr; const GlobalValue *GV2 = nullptr; int64_t Offset1 = 0; int64_t Offset2 = 0; bool isGA1 = TLI->isGAPlusOffset(Loc.getNode(), GV1, Offset1); bool isGA2 = TLI->isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); if (isGA1 && isGA2 && GV1 == GV2) return Offset1 == (Offset2 + Dist*Bytes); return false; } /// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if /// it cannot be inferred. unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { // If this is a GlobalAddress + cst, return the alignment. const GlobalValue *GV; int64_t GVOffset = 0; if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) { unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType()); APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0); llvm::computeKnownBits(const_cast(GV), KnownZero, KnownOne, getDataLayout()); unsigned AlignBits = KnownZero.countTrailingOnes(); unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0; if (Align) return MinAlign(Align, GVOffset); } // If this is a direct reference to a stack slot, use information about the // stack slot's alignment. int FrameIdx = 1 << 31; int64_t FrameOffset = 0; if (FrameIndexSDNode *FI = dyn_cast(Ptr)) { FrameIdx = FI->getIndex(); } else if (isBaseWithConstantOffset(Ptr) && isa(Ptr.getOperand(0))) { // Handle FI+Cst FrameIdx = cast(Ptr.getOperand(0))->getIndex(); FrameOffset = Ptr.getConstantOperandVal(1); } if (FrameIdx != (1 << 31)) { const MachineFrameInfo &MFI = *getMachineFunction().getFrameInfo(); unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx), FrameOffset); return FIInfoAlign; } return 0; } /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type /// which is split (or expanded) into two not necessarily identical pieces. std::pair SelectionDAG::GetSplitDestVTs(const EVT &VT) const { // Currently all types are split in half. EVT LoVT, HiVT; if (!VT.isVector()) { LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT); } else { unsigned NumElements = VT.getVectorNumElements(); assert(!(NumElements & 1) && "Splitting vector, but not in half!"); LoVT = HiVT = EVT::getVectorVT(*getContext(), VT.getVectorElementType(), NumElements/2); } return std::make_pair(LoVT, HiVT); } /// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the /// low/high part. std::pair SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT) { assert(LoVT.getVectorNumElements() + HiVT.getVectorNumElements() <= N.getValueType().getVectorNumElements() && "More vector elements requested than available!"); SDValue Lo, Hi; Lo = getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, getConstant(0, DL, TLI->getVectorIdxTy(getDataLayout()))); Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N, getConstant(LoVT.getVectorNumElements(), DL, TLI->getVectorIdxTy(getDataLayout()))); return std::make_pair(Lo, Hi); } void SelectionDAG::ExtractVectorElements(SDValue Op, SmallVectorImpl &Args, unsigned Start, unsigned Count) { EVT VT = Op.getValueType(); if (Count == 0) Count = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); EVT IdxTy = TLI->getVectorIdxTy(getDataLayout()); SDLoc SL(Op); for (unsigned i = Start, e = Start + Count; i != e; ++i) { Args.push_back(getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Op, getConstant(i, SL, IdxTy))); } } // getAddressSpace - Return the address space this GlobalAddress belongs to. unsigned GlobalAddressSDNode::getAddressSpace() const { return getGlobal()->getType()->getAddressSpace(); } Type *ConstantPoolSDNode::getType() const { if (isMachineConstantPoolEntry()) return Val.MachineCPVal->getType(); return Val.ConstVal->getType(); } bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits, bool isBigEndian) const { EVT VT = getValueType(0); assert(VT.isVector() && "Expected a vector type"); unsigned sz = VT.getSizeInBits(); if (MinSplatBits > sz) return false; SplatValue = APInt(sz, 0); SplatUndef = APInt(sz, 0); // Get the bits. Bits with undefined values (when the corresponding element // of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared // in SplatValue. If any of the values are not constant, give up and return // false. unsigned int nOps = getNumOperands(); assert(nOps > 0 && "isConstantSplat has 0-size build vector"); unsigned EltBitSize = VT.getVectorElementType().getSizeInBits(); for (unsigned j = 0; j < nOps; ++j) { unsigned i = isBigEndian ? nOps-1-j : j; SDValue OpVal = getOperand(i); unsigned BitPos = j * EltBitSize; if (OpVal.getOpcode() == ISD::UNDEF) SplatUndef |= APInt::getBitsSet(sz, BitPos, BitPos + EltBitSize); else if (ConstantSDNode *CN = dyn_cast(OpVal)) SplatValue |= CN->getAPIntValue().zextOrTrunc(EltBitSize). zextOrTrunc(sz) << BitPos; else if (ConstantFPSDNode *CN = dyn_cast(OpVal)) SplatValue |= CN->getValueAPF().bitcastToAPInt().zextOrTrunc(sz) < 8) { unsigned HalfSize = sz / 2; APInt HighValue = SplatValue.lshr(HalfSize).trunc(HalfSize); APInt LowValue = SplatValue.trunc(HalfSize); APInt HighUndef = SplatUndef.lshr(HalfSize).trunc(HalfSize); APInt LowUndef = SplatUndef.trunc(HalfSize); // If the two halves do not match (ignoring undef bits), stop here. if ((HighValue & ~LowUndef) != (LowValue & ~HighUndef) || MinSplatBits > HalfSize) break; SplatValue = HighValue | LowValue; SplatUndef = HighUndef & LowUndef; sz = HalfSize; } SplatBitSize = sz; return true; } SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const { if (UndefElements) { UndefElements->clear(); UndefElements->resize(getNumOperands()); } SDValue Splatted; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { SDValue Op = getOperand(i); if (Op.getOpcode() == ISD::UNDEF) { if (UndefElements) (*UndefElements)[i] = true; } else if (!Splatted) { Splatted = Op; } else if (Splatted != Op) { return SDValue(); } } if (!Splatted) { assert(getOperand(0).getOpcode() == ISD::UNDEF && "Can only have a splat without a constant for all undefs."); return getOperand(0); } return Splatted; } ConstantSDNode * BuildVectorSDNode::getConstantSplatNode(BitVector *UndefElements) const { return dyn_cast_or_null(getSplatValue(UndefElements)); } ConstantFPSDNode * BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const { return dyn_cast_or_null(getSplatValue(UndefElements)); } int32_t BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const { if (ConstantFPSDNode *CN = dyn_cast_or_null(getSplatValue(UndefElements))) { bool IsExact; APSInt IntVal(BitWidth); APFloat APF = CN->getValueAPF(); if (APF.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) != APFloat::opOK || !IsExact) return -1; return IntVal.exactLogBase2(); } return -1; } bool BuildVectorSDNode::isConstant() const { for (const SDValue &Op : op_values()) { unsigned Opc = Op.getOpcode(); if (Opc != ISD::UNDEF && Opc != ISD::Constant && Opc != ISD::ConstantFP) return false; } return true; } bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) { // Find the first non-undef value in the shuffle mask. unsigned i, e; for (i = 0, e = VT.getVectorNumElements(); i != e && Mask[i] < 0; ++i) /* search */; assert(i != e && "VECTOR_SHUFFLE node with all undef indices!"); // Make sure all remaining elements are either undef or the same as the first // non-undef value. for (int Idx = Mask[i]; i != e; ++i) if (Mask[i] >= 0 && Mask[i] != Idx) return false; return true; } #ifndef NDEBUG static void checkForCyclesHelper(const SDNode *N, SmallPtrSetImpl &Visited, SmallPtrSetImpl &Checked, const llvm::SelectionDAG *DAG) { // If this node has already been checked, don't check it again. if (Checked.count(N)) return; // If a node has already been visited on this depth-first walk, reject it as // a cycle. if (!Visited.insert(N).second) { errs() << "Detected cycle in SelectionDAG\n"; dbgs() << "Offending node:\n"; N->dumprFull(DAG); dbgs() << "\n"; abort(); } for (const SDValue &Op : N->op_values()) checkForCyclesHelper(Op.getNode(), Visited, Checked, DAG); Checked.insert(N); Visited.erase(N); } #endif void llvm::checkForCycles(const llvm::SDNode *N, const llvm::SelectionDAG *DAG, bool force) { #ifndef NDEBUG bool check = force; #ifdef XDEBUG check = true; #endif // XDEBUG if (check) { assert(N && "Checking nonexistent SDNode"); SmallPtrSet visited; SmallPtrSet checked; checkForCyclesHelper(N, visited, checked, DAG); } #endif // !NDEBUG } void llvm::checkForCycles(const llvm::SelectionDAG *DAG, bool force) { checkForCycles(DAG->getRoot().getNode(), DAG, force); } Index: projects/clang380-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- projects/clang380-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 294609) @@ -1,10164 +1,10165 @@ //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the AArch64TargetLowering class. // //===----------------------------------------------------------------------===// #include "AArch64ISelLowering.h" #include "AArch64CallingConvention.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64PerfectShuffle.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Type.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; #define DEBUG_TYPE "aarch64-lower" STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); // Place holder until extr generation is tested fully. static cl::opt EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, cl::desc("Allow AArch64 (or (shift)(shift))->extract"), cl::init(true)); static cl::opt EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, cl::desc("Allow AArch64 SLI/SRI formation"), cl::init(false)); // FIXME: The necessary dtprel relocations don't seem to be supported // well in the GNU bfd and gold linkers at the moment. Therefore, by // default, for now, fall back to GeneralDynamic code generation. cl::opt EnableAArch64ELFLocalDynamicTLSGeneration( "aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so // we have to make something up. Arbitrarily, choose ZeroOrOne. setBooleanContents(ZeroOrOneBooleanContent); // When comparing vectors the result sets the different elements in the // vector to all-one or all-zero. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // Set up the register classes. addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); if (Subtarget->hasFPARMv8()) { addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); } if (Subtarget->hasNEON()) { addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); // Someone set us up the NEON. addDRTypeForNEON(MVT::v2f32); addDRTypeForNEON(MVT::v8i8); addDRTypeForNEON(MVT::v4i16); addDRTypeForNEON(MVT::v2i32); addDRTypeForNEON(MVT::v1i64); addDRTypeForNEON(MVT::v1f64); addDRTypeForNEON(MVT::v4f16); addQRTypeForNEON(MVT::v4f32); addQRTypeForNEON(MVT::v2f64); addQRTypeForNEON(MVT::v16i8); addQRTypeForNEON(MVT::v8i16); addQRTypeForNEON(MVT::v4i32); addQRTypeForNEON(MVT::v2i64); addQRTypeForNEON(MVT::v8f16); } // Compute derived properties from the register classes computeRegisterProperties(Subtarget->getRegisterInfo()); // Provide all sorts of operation actions setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); setOperationAction(ISD::SETCC, MVT::i32, Custom); setOperationAction(ISD::SETCC, MVT::i64, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i64, Custom); setOperationAction(ISD::BR_CC, MVT::f32, Custom); setOperationAction(ISD::BR_CC, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::JumpTable, MVT::i64, Custom); setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); setOperationAction(ISD::FREM, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f80, Expand); // Custom lowering hooks are needed for XOR // to fold it into CSINC/CSINV. setOperationAction(ISD::XOR, MVT::i32, Custom); setOperationAction(ISD::XOR, MVT::i64, Custom); // Virtually no operation on f128 is legal, but LLVM can't expand them when // there's a valid register class, so we need custom operations in most cases. setOperationAction(ISD::FABS, MVT::f128, Expand); setOperationAction(ISD::FADD, MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); setOperationAction(ISD::FCOS, MVT::f128, Expand); setOperationAction(ISD::FDIV, MVT::f128, Custom); setOperationAction(ISD::FMA, MVT::f128, Expand); setOperationAction(ISD::FMUL, MVT::f128, Custom); setOperationAction(ISD::FNEG, MVT::f128, Expand); setOperationAction(ISD::FPOW, MVT::f128, Expand); setOperationAction(ISD::FREM, MVT::f128, Expand); setOperationAction(ISD::FRINT, MVT::f128, Expand); setOperationAction(ISD::FSIN, MVT::f128, Expand); setOperationAction(ISD::FSINCOS, MVT::f128, Expand); setOperationAction(ISD::FSQRT, MVT::f128, Expand); setOperationAction(ISD::FSUB, MVT::f128, Custom); setOperationAction(ISD::FTRUNC, MVT::f128, Expand); setOperationAction(ISD::SETCC, MVT::f128, Custom); setOperationAction(ISD::BR_CC, MVT::f128, Custom); setOperationAction(ISD::SELECT, MVT::f128, Custom); setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); // Lowering for many of the conversions is actually specified by the non-f128 // type. The LowerXXX function will be trivial when f128 isn't involved. setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); // Variable arguments. setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::Other, Custom); setOperationAction(ISD::VACOPY, MVT::Other, Custom); setOperationAction(ISD::VAEND, MVT::Other, Expand); // Variable-sized objects. setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); // Constant pool entries setOperationAction(ISD::ConstantPool, MVT::i64, Custom); // BlockAddress setOperationAction(ISD::BlockAddress, MVT::i64, Custom); // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. setOperationAction(ISD::ADDC, MVT::i32, Custom); setOperationAction(ISD::ADDE, MVT::i32, Custom); setOperationAction(ISD::SUBC, MVT::i32, Custom); setOperationAction(ISD::SUBE, MVT::i32, Custom); setOperationAction(ISD::ADDC, MVT::i64, Custom); setOperationAction(ISD::ADDE, MVT::i64, Custom); setOperationAction(ISD::SUBC, MVT::i64, Custom); setOperationAction(ISD::SUBE, MVT::i64, Custom); // AArch64 lacks both left-rotate and popcount instructions. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } // AArch64 doesn't have {U|S}MUL_LOHI. setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero // counterparts, which AArch64 supports directly. setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); // Custom lower Add/Sub/Mul with overflow. setOperationAction(ISD::SADDO, MVT::i32, Custom); setOperationAction(ISD::SADDO, MVT::i64, Custom); setOperationAction(ISD::UADDO, MVT::i32, Custom); setOperationAction(ISD::UADDO, MVT::i64, Custom); setOperationAction(ISD::SSUBO, MVT::i32, Custom); setOperationAction(ISD::SSUBO, MVT::i64, Custom); setOperationAction(ISD::USUBO, MVT::i32, Custom); setOperationAction(ISD::USUBO, MVT::i64, Custom); setOperationAction(ISD::SMULO, MVT::i32, Custom); setOperationAction(ISD::SMULO, MVT::i64, Custom); setOperationAction(ISD::UMULO, MVT::i32, Custom); setOperationAction(ISD::UMULO, MVT::i64, Custom); setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // f16 is a storage-only type, always promote it to f32. setOperationAction(ISD::SETCC, MVT::f16, Promote); setOperationAction(ISD::BR_CC, MVT::f16, Promote); setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); setOperationAction(ISD::SELECT, MVT::f16, Promote); setOperationAction(ISD::FADD, MVT::f16, Promote); setOperationAction(ISD::FSUB, MVT::f16, Promote); setOperationAction(ISD::FMUL, MVT::f16, Promote); setOperationAction(ISD::FDIV, MVT::f16, Promote); setOperationAction(ISD::FREM, MVT::f16, Promote); setOperationAction(ISD::FMA, MVT::f16, Promote); setOperationAction(ISD::FNEG, MVT::f16, Promote); setOperationAction(ISD::FABS, MVT::f16, Promote); setOperationAction(ISD::FCEIL, MVT::f16, Promote); setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); setOperationAction(ISD::FCOS, MVT::f16, Promote); setOperationAction(ISD::FFLOOR, MVT::f16, Promote); setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); setOperationAction(ISD::FPOW, MVT::f16, Promote); setOperationAction(ISD::FPOWI, MVT::f16, Promote); setOperationAction(ISD::FRINT, MVT::f16, Promote); setOperationAction(ISD::FSIN, MVT::f16, Promote); setOperationAction(ISD::FSINCOS, MVT::f16, Promote); setOperationAction(ISD::FSQRT, MVT::f16, Promote); setOperationAction(ISD::FEXP, MVT::f16, Promote); setOperationAction(ISD::FEXP2, MVT::f16, Promote); setOperationAction(ISD::FLOG, MVT::f16, Promote); setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::FLOG10, MVT::f16, Promote); setOperationAction(ISD::FROUND, MVT::f16, Promote); setOperationAction(ISD::FTRUNC, MVT::f16, Promote); setOperationAction(ISD::FMINNUM, MVT::f16, Promote); setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); setOperationAction(ISD::FMINNAN, MVT::f16, Promote); setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); // v4f16 is also a storage-only type, so promote it to v4f32 when that is // known to be safe. setOperationAction(ISD::FADD, MVT::v4f16, Promote); setOperationAction(ISD::FSUB, MVT::v4f16, Promote); setOperationAction(ISD::FMUL, MVT::v4f16, Promote); setOperationAction(ISD::FDIV, MVT::v4f16, Promote); setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote); setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote); AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32); // Expand all other v4f16 operations. // FIXME: We could generate better code by promoting some operations to // a pair of v4f32s setOperationAction(ISD::FABS, MVT::v4f16, Expand); setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); setOperationAction(ISD::FCOS, MVT::v4f16, Expand); setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); setOperationAction(ISD::FMA, MVT::v4f16, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); setOperationAction(ISD::FNEG, MVT::v4f16, Expand); setOperationAction(ISD::FPOW, MVT::v4f16, Expand); setOperationAction(ISD::FPOWI, MVT::v4f16, Expand); setOperationAction(ISD::FREM, MVT::v4f16, Expand); setOperationAction(ISD::FROUND, MVT::v4f16, Expand); setOperationAction(ISD::FRINT, MVT::v4f16, Expand); setOperationAction(ISD::FSIN, MVT::v4f16, Expand); setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand); setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); setOperationAction(ISD::SETCC, MVT::v4f16, Expand); setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); setOperationAction(ISD::SELECT, MVT::v4f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); setOperationAction(ISD::FEXP, MVT::v4f16, Expand); setOperationAction(ISD::FEXP2, MVT::v4f16, Expand); setOperationAction(ISD::FLOG, MVT::v4f16, Expand); setOperationAction(ISD::FLOG2, MVT::v4f16, Expand); setOperationAction(ISD::FLOG10, MVT::v4f16, Expand); // v8f16 is also a storage-only type, so expand it. setOperationAction(ISD::FABS, MVT::v8f16, Expand); setOperationAction(ISD::FADD, MVT::v8f16, Expand); setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); setOperationAction(ISD::FCOS, MVT::v8f16, Expand); setOperationAction(ISD::FDIV, MVT::v8f16, Expand); setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); setOperationAction(ISD::FMA, MVT::v8f16, Expand); setOperationAction(ISD::FMUL, MVT::v8f16, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); setOperationAction(ISD::FNEG, MVT::v8f16, Expand); setOperationAction(ISD::FPOW, MVT::v8f16, Expand); setOperationAction(ISD::FPOWI, MVT::v8f16, Expand); setOperationAction(ISD::FREM, MVT::v8f16, Expand); setOperationAction(ISD::FROUND, MVT::v8f16, Expand); setOperationAction(ISD::FRINT, MVT::v8f16, Expand); setOperationAction(ISD::FSIN, MVT::v8f16, Expand); setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand); setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); setOperationAction(ISD::FSUB, MVT::v8f16, Expand); setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); setOperationAction(ISD::SETCC, MVT::v8f16, Expand); setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); setOperationAction(ISD::SELECT, MVT::v8f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); setOperationAction(ISD::FEXP, MVT::v8f16, Expand); setOperationAction(ISD::FEXP2, MVT::v8f16, Expand); setOperationAction(ISD::FLOG, MVT::v8f16, Expand); setOperationAction(ISD::FLOG2, MVT::v8f16, Expand); setOperationAction(ISD::FLOG10, MVT::v8f16, Expand); // AArch64 has implementations of a lot of rounding-like FP operations. for (MVT Ty : {MVT::f32, MVT::f64}) { setOperationAction(ISD::FFLOOR, Ty, Legal); setOperationAction(ISD::FNEARBYINT, Ty, Legal); setOperationAction(ISD::FCEIL, Ty, Legal); setOperationAction(ISD::FRINT, Ty, Legal); setOperationAction(ISD::FTRUNC, Ty, Legal); setOperationAction(ISD::FROUND, Ty, Legal); setOperationAction(ISD::FMINNUM, Ty, Legal); setOperationAction(ISD::FMAXNUM, Ty, Legal); setOperationAction(ISD::FMINNAN, Ty, Legal); setOperationAction(ISD::FMAXNAN, Ty, Legal); } setOperationAction(ISD::PREFETCH, MVT::Other, Custom); // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. // This requires the Performance Monitors extension. if (Subtarget->hasPerfMon()) setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); if (Subtarget->isTargetMachO()) { // For iOS, we don't want to the normal expansion of a libcall to // sincos. We want to issue a libcall to __sincos_stret to avoid memory // traffic. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } else { setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } // Make floating-point constants legal for the large code model, so they don't // become loads from the constant pool. if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { setOperationAction(ISD::ConstantFP, MVT::f32, Legal); setOperationAction(ISD::ConstantFP, MVT::f64, Legal); } // AArch64 does not have floating-point extending loads, i1 sign-extending // load, floating-point truncating stores, or v2i32->v2i16 truncating store. for (MVT VT : MVT::fp_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); } for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f128, MVT::f80, Expand); setTruncStoreAction(MVT::f128, MVT::f64, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f16, Expand); setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::BITCAST, MVT::f16, Custom); // Indexed loads and stores are supported. for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, MVT::i8, Legal); setIndexedLoadAction(im, MVT::i16, Legal); setIndexedLoadAction(im, MVT::i32, Legal); setIndexedLoadAction(im, MVT::i64, Legal); setIndexedLoadAction(im, MVT::f64, Legal); setIndexedLoadAction(im, MVT::f32, Legal); setIndexedLoadAction(im, MVT::f16, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); setIndexedStoreAction(im, MVT::i64, Legal); setIndexedStoreAction(im, MVT::f64, Legal); setIndexedStoreAction(im, MVT::f32, Legal); setIndexedStoreAction(im, MVT::f16, Legal); } // Trap. setOperationAction(ISD::TRAP, MVT::Other, Legal); // We combine OR nodes for bitfield operations. setTargetDAGCombine(ISD::OR); // Vector add and sub nodes may conceal a high-half opportunity. // Also, try to fold ADD into CSINC/CSINV.. setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); setTargetDAGCombine(ISD::FDIV); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::STORE); if (Subtarget->supportsAddressTopByteIgnored()) setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; setStackPointerRegisterToSaveRestore(AArch64::SP); setSchedulingPreference(Sched::Hybrid); // Enable TBZ/TBNZ MaskAndBranchFoldingIsLegal = true; EnableExtLdPromotion = true; setMinFunctionAlignment(2); setHasExtractBitsInsn(true); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); if (Subtarget->hasNEON()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to // silliness like this: setOperationAction(ISD::FABS, MVT::v1f64, Expand); setOperationAction(ISD::FADD, MVT::v1f64, Expand); setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); setOperationAction(ISD::FCOS, MVT::v1f64, Expand); setOperationAction(ISD::FDIV, MVT::v1f64, Expand); setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); setOperationAction(ISD::FMA, MVT::v1f64, Expand); setOperationAction(ISD::FMUL, MVT::v1f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); setOperationAction(ISD::FNEG, MVT::v1f64, Expand); setOperationAction(ISD::FPOW, MVT::v1f64, Expand); setOperationAction(ISD::FREM, MVT::v1f64, Expand); setOperationAction(ISD::FROUND, MVT::v1f64, Expand); setOperationAction(ISD::FRINT, MVT::v1f64, Expand); setOperationAction(ISD::FSIN, MVT::v1f64, Expand); setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); setOperationAction(ISD::FSUB, MVT::v1f64, Expand); setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); setOperationAction(ISD::SETCC, MVT::v1f64, Expand); setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); setOperationAction(ISD::SELECT, MVT::v1f64, Expand); setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); setOperationAction(ISD::MUL, MVT::v1i64, Expand); // AArch64 doesn't have a direct vector ->f32 conversion instructions for // elements smaller than i32, so promote the input to i32 first. setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote); // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16 // -> v8f16 conversions. setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Promote); // Similarly, there is no direct i32 -> f64 vector conversion instruction. setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); // Or, direct i32 -> f16 vector conversion. Set it so custom, so the // conversion happens in two steps: v4i32 -> v4f32 -> v4f16 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); // AArch64 doesn't have MUL.2d: setOperationAction(ISD::MUL, MVT::v2i64, Expand); // Custom handling for some quad-vector types to detect MULL. setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled // directly. for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } } // AArch64 has implementations of a lot of rounding-like FP operations. for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, Ty, Legal); setOperationAction(ISD::FNEARBYINT, Ty, Legal); setOperationAction(ISD::FCEIL, Ty, Legal); setOperationAction(ISD::FRINT, Ty, Legal); setOperationAction(ISD::FTRUNC, Ty, Legal); setOperationAction(ISD::FROUND, Ty, Legal); } } // Prefer likely predicted branches to selects on out-of-order cores. if (Subtarget->isCortexA57()) PredictableSelectIsExpensive = true; } void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { if (VT == MVT::v2f32 || VT == MVT::v4f16) { setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) { setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64); } // Mark vector float intrinsics as expand. if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand); setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand); setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand); setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand); setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand); setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand); setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); // But we do support custom-lowering for FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom); } setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom); setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom); setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); setOperationAction(ISD::AND, VT.getSimpleVT(), Custom); setOperationAction(ISD::OR, VT.getSimpleVT(), Custom); setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); for (MVT InnerVT : MVT::all_valuetypes()) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand); // CNT supports only B element sizes. if (VT != MVT::v8i8 && VT != MVT::v16i8) setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand); setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); // [SU][MIN|MAX] are available for all NEON types apart from i64. if (!VT.isFloatingPoint() && VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64) for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT.getSimpleVT(), Legal); // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!). if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16) for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN, ISD::FMINNUM, ISD::FMAXNUM}) setOperationAction(Opcode, VT.getSimpleVT(), Legal); if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, VT.getSimpleVT(), Legal); setIndexedStoreAction(im, VT.getSimpleVT(), Legal); } } } void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { addRegisterClass(VT, &AArch64::FPR64RegClass); addTypeForNEON(VT, MVT::v2i32); } void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { addRegisterClass(VT, &AArch64::FPR128RegClass); addTypeForNEON(VT, MVT::v4i32); } EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, EVT VT) const { if (!VT.isVector()) return MVT::i32; return VT.changeVectorElementTypeToInteger(); } /// computeKnownBitsForTargetNode - Determine which of the bits specified in /// Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. void AArch64TargetLowering::computeKnownBitsForTargetNode( const SDValue Op, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { switch (Op.getOpcode()) { default: break; case AArch64ISD::CSEL: { APInt KnownZero2, KnownOne2; DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1); DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1); KnownZero &= KnownZero2; KnownOne &= KnownOne2; break; } case ISD::INTRINSIC_W_CHAIN: { ConstantSDNode *CN = cast(Op->getOperand(1)); Intrinsic::ID IntID = static_cast(CN->getZExtValue()); switch (IntID) { default: return; case Intrinsic::aarch64_ldaxr: case Intrinsic::aarch64_ldxr: { unsigned BitWidth = KnownOne.getBitWidth(); EVT VT = cast(Op)->getMemoryVT(); unsigned MemBits = VT.getScalarType().getSizeInBits(); KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); return; } } break; } case ISD::INTRINSIC_WO_CHAIN: case ISD::INTRINSIC_VOID: { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); switch (IntNo) { default: break; case Intrinsic::aarch64_neon_umaxv: case Intrinsic::aarch64_neon_uminv: { // Figure out the datatype of the vector operand. The UMINV instruction // will zero extend the result, so we can mark as known zero all the // bits larger than the element datatype. 32-bit or larget doesn't need // this as those are legal types and will be handled by isel directly. MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); unsigned BitWidth = KnownZero.getBitWidth(); if (VT == MVT::v8i8 || VT == MVT::v16i8) { assert(BitWidth >= 8 && "Unexpected width!"); APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); KnownZero |= Mask; } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { assert(BitWidth >= 16 && "Unexpected width!"); APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); KnownZero |= Mask; } break; } break; } } } } MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, EVT) const { return MVT::i64; } bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, bool *Fast) const { if (Subtarget->requiresStrictAlign()) return false; // FIXME: This is mostly true for Cyclone, but not necessarily others. if (Fast) { // FIXME: Define an attribute for slow unaligned accesses instead of // relying on the CPU type as a proxy. // On Cyclone, unaligned 128-bit stores are slow. *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 || // See comments in performSTORECombine() for more details about // these conditions. // Code that uses clang vector extensions can mark that it // wants unaligned accesses to be treated as fast by // underspecifying alignment to be 1 or 2. Align <= 2 || // Disregard v2i64. Memcpy lowering produces those and splitting // them regresses performance on micro-benchmarks and olden/bh. VT == MVT::v2i64; } return true; } FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return AArch64::createFastISel(funcInfo, libInfo); } const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((AArch64ISD::NodeType)Opcode) { case AArch64ISD::FIRST_NUMBER: break; case AArch64ISD::CALL: return "AArch64ISD::CALL"; case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ"; case AArch64ISD::ADC: return "AArch64ISD::ADC"; case AArch64ISD::SBC: return "AArch64ISD::SBC"; case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; case AArch64ISD::DUP: return "AArch64ISD::DUP"; case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; case AArch64ISD::MOVI: return "AArch64ISD::MOVI"; case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; case AArch64ISD::BICi: return "AArch64ISD::BICi"; case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; case AArch64ISD::BSL: return "AArch64ISD::BSL"; case AArch64ISD::NEG: return "AArch64ISD::NEG"; case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; case AArch64ISD::UZP1: return "AArch64ISD::UZP1"; case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; case AArch64ISD::REV16: return "AArch64ISD::REV16"; case AArch64ISD::REV32: return "AArch64ISD::REV32"; case AArch64ISD::REV64: return "AArch64ISD::REV64"; case AArch64ISD::EXT: return "AArch64ISD::EXT"; case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; case AArch64ISD::CMGE: return "AArch64ISD::CMGE"; case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; case AArch64ISD::CMHS: return "AArch64ISD::CMHS"; case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz"; case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; case AArch64ISD::SADDV: return "AArch64ISD::SADDV"; case AArch64ISD::UADDV: return "AArch64ISD::UADDV"; case AArch64ISD::SMINV: return "AArch64ISD::SMINV"; case AArch64ISD::UMINV: return "AArch64ISD::UMINV"; case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV"; case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV"; case AArch64ISD::NOT: return "AArch64ISD::NOT"; case AArch64ISD::BIT: return "AArch64ISD::BIT"; case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH"; case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST"; case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post"; case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost"; case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; } return nullptr; } MachineBasicBlock * AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, MachineBasicBlock *MBB) const { // We materialise the F128CSEL pseudo-instruction as some control flow and a // phi node: // OrigBB: // [... previous instrs leading to comparison ...] // b.ne TrueBB // b EndBB // TrueBB: // ; Fallthrough // EndBB: // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); DebugLoc DL = MI->getDebugLoc(); MachineFunction::iterator It = ++MBB->getIterator(); unsigned DestReg = MI->getOperand(0).getReg(); unsigned IfTrueReg = MI->getOperand(1).getReg(); unsigned IfFalseReg = MI->getOperand(2).getReg(); unsigned CondCode = MI->getOperand(3).getImm(); bool NZCVKilled = MI->getOperand(4).isKill(); MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, TrueBB); MF->insert(It, EndBB); // Transfer rest of current basic-block to EndBB EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); EndBB->transferSuccessorsAndUpdatePHIs(MBB); BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); MBB->addSuccessor(TrueBB); MBB->addSuccessor(EndBB); // TrueBB falls through to the end. TrueBB->addSuccessor(EndBB); if (!NZCVKilled) { TrueBB->addLiveIn(AArch64::NZCV); EndBB->addLiveIn(AArch64::NZCV); } BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) .addReg(IfTrueReg) .addMBB(TrueBB) .addReg(IfFalseReg) .addMBB(MBB); MI->eraseFromParent(); return EndBB; } MachineBasicBlock * AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { switch (MI->getOpcode()) { default: #ifndef NDEBUG MI->dump(); #endif llvm_unreachable("Unexpected instruction for custom inserter!"); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); } } //===----------------------------------------------------------------------===// // AArch64 Lowering private implementation. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // Lowering Code //===----------------------------------------------------------------------===// /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 /// CC static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { switch (CC) { default: llvm_unreachable("Unknown condition code!"); case ISD::SETNE: return AArch64CC::NE; case ISD::SETEQ: return AArch64CC::EQ; case ISD::SETGT: return AArch64CC::GT; case ISD::SETGE: return AArch64CC::GE; case ISD::SETLT: return AArch64CC::LT; case ISD::SETLE: return AArch64CC::LE; case ISD::SETUGT: return AArch64CC::HI; case ISD::SETUGE: return AArch64CC::HS; case ISD::SETULT: return AArch64CC::LO; case ISD::SETULE: return AArch64CC::LS; } } /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2) { CondCode2 = AArch64CC::AL; switch (CC) { default: llvm_unreachable("Unknown FP condition!"); case ISD::SETEQ: case ISD::SETOEQ: CondCode = AArch64CC::EQ; break; case ISD::SETGT: case ISD::SETOGT: CondCode = AArch64CC::GT; break; case ISD::SETGE: case ISD::SETOGE: CondCode = AArch64CC::GE; break; case ISD::SETOLT: CondCode = AArch64CC::MI; break; case ISD::SETOLE: CondCode = AArch64CC::LS; break; case ISD::SETONE: CondCode = AArch64CC::MI; CondCode2 = AArch64CC::GT; break; case ISD::SETO: CondCode = AArch64CC::VC; break; case ISD::SETUO: CondCode = AArch64CC::VS; break; case ISD::SETUEQ: CondCode = AArch64CC::EQ; CondCode2 = AArch64CC::VS; break; case ISD::SETUGT: CondCode = AArch64CC::HI; break; case ISD::SETUGE: CondCode = AArch64CC::PL; break; case ISD::SETLT: case ISD::SETULT: CondCode = AArch64CC::LT; break; case ISD::SETLE: case ISD::SETULE: CondCode = AArch64CC::LE; break; case ISD::SETNE: case ISD::SETUNE: CondCode = AArch64CC::NE; break; } } /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 /// CC usable with the vector instructions. Fewer operations are available /// without a real NZCV register, so we have to use less efficient combinations /// to get the same effect. static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert) { Invert = false; switch (CC) { default: // Mostly the scalar mappings work fine. changeFPCCToAArch64CC(CC, CondCode, CondCode2); break; case ISD::SETUO: Invert = true; // Fallthrough case ISD::SETO: CondCode = AArch64CC::MI; CondCode2 = AArch64CC::GE; break; case ISD::SETUEQ: case ISD::SETULT: case ISD::SETULE: case ISD::SETUGT: case ISD::SETUGE: // All of the compare-mask comparisons are ordered, but we can switch // between the two by a double inversion. E.g. ULE == !OGT. Invert = true; changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); break; } } static bool isLegalArithImmed(uint64_t C) { // Matches AArch64DAGToDAGISel::SelectArithImmed(). return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); } static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDLoc dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); if (VT.isFloatingPoint()) return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); // The CMP instruction is just an alias for SUBS, and representing it as // SUBS means that it's possible to get CSE with subtract operations. // A later phase can perform the optimization of setting the destination // register to WZR/XZR if it ends up being unused. unsigned Opcode = AArch64ISD::SUBS; if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags // can be set differently by this operation. It comes down to whether // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then // everything is fine. If not then the optimization is wrong. Thus general // comparisons are only valid if op2 != 0. // So, finally, the only LLVM-native comparisons that don't mention C and V // are SETEQ and SETNE. They're the only ones we can safely use CMN for in // the absence of information about op2. Opcode = AArch64ISD::ADDS; RHS = RHS.getOperand(1); } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { // Similarly, (CMP (and X, Y), 0) can be implemented with a TST // (a.k.a. ANDS) except that the flags are only guaranteed to work for one // of the signed comparisons. Opcode = AArch64ISD::ANDS; RHS = LHS.getOperand(1); LHS = LHS.getOperand(0); } return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) .getValue(1); } /// \defgroup AArch64CCMP CMP;CCMP matching /// /// These functions deal with the formation of CMP;CCMP;... sequences. /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of /// a comparison. They set the NZCV flags to a predefined value if their /// predicate is false. This allows to express arbitrary conjunctions, for /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B))))" /// expressed as: /// cmp A /// ccmp B, inv(CB), CA /// check for CB flags /// /// In general we can create code for arbitrary "... (and (and A B) C)" /// sequences. We can also implement some "or" expressions, because "(or A B)" /// is equivalent to "not (and (not A) (not B))" and we can implement some /// negation operations: /// We can negate the results of a single comparison by inverting the flags /// used when the predicate fails and inverting the flags tested in the next /// instruction; We can also negate the results of the whole previous /// conditional compare sequence by inverting the flags tested in the next /// instruction. However there is no way to negate the result of a partial /// sequence. /// /// Therefore on encountering an "or" expression we can negate the subtree on /// one side and have to be able to push the negate to the leafs of the subtree /// on the other side (see also the comments in code). As complete example: /// "or (or (setCA (cmp A)) (setCB (cmp B))) /// (and (setCC (cmp C)) (setCD (cmp D)))" /// is transformed to /// "not (and (not (and (setCC (cmp C)) (setCC (cmp D)))) /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))" /// and implemented as: /// cmp C /// ccmp D, inv(CD), CC /// ccmp A, CA, inv(CD) /// ccmp B, CB, inv(CA) /// check for CB flags /// A counterexample is "or (and A B) (and C D)" which cannot be implemented /// by conditional compare sequences. /// @{ /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, SDValue Condition, unsigned NZCV, SDLoc DL, SelectionDAG &DAG) { unsigned Opcode = 0; if (LHS.getValueType().isFloatingPoint()) Opcode = AArch64ISD::FCCMP; else if (RHS.getOpcode() == ISD::SUB) { SDValue SubOp0 = RHS.getOperand(0); if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // See emitComparison() on why we can only do this for SETEQ and SETNE. Opcode = AArch64ISD::CCMN; RHS = RHS.getOperand(1); } } if (Opcode == 0) Opcode = AArch64ISD::CCMP; SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); } /// Returns true if @p Val is a tree of AND/OR/SETCC operations. /// CanPushNegate is set to true if we can push a negate operation through /// the tree in a was that we are left with AND operations and negate operations /// at the leafs only. i.e. "not (or (or x y) z)" can be changed to /// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be /// brought into such a form. static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate, unsigned Depth = 0) { if (!Val.hasOneUse()) return false; unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { CanPushNegate = true; return true; } // Protect against stack overflow. if (Depth > 15) return false; if (Opcode == ISD::AND || Opcode == ISD::OR) { SDValue O0 = Val->getOperand(0); SDValue O1 = Val->getOperand(1); bool CanPushNegateL; if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1)) return false; bool CanPushNegateR; if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1)) return false; // We cannot push a negate through an AND operation (it would become an OR), // we can however change a (not (or x y)) to (and (not x) (not y)) if we can // push the negate through the x/y subtrees. CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR; return true; } return false; } /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain /// of CCMP/CFCMP ops. See @ref AArch64CCMP. /// Tries to transform the given i1 producing node @p Val to a series compare /// and conditional compare operations. @returns an NZCV flags producing node /// and sets @p OutCC to the flags that should be tested or returns SDValue() if /// transformation was not possible. /// On recursive invocations @p PushNegate may be set to true to have negation /// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate /// for the comparisons in the current subtree; @p Depth limits the search /// depth to avoid stack overflow. static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool PushNegate = false, SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL, unsigned Depth = 0) { // We're at a tree leaf, produce a conditional comparison operation. unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { SDValue LHS = Val->getOperand(0); SDValue RHS = Val->getOperand(1); ISD::CondCode CC = cast(Val->getOperand(2))->get(); bool isInteger = LHS.getValueType().isInteger(); if (PushNegate) CC = getSetCCInverse(CC, isInteger); SDLoc DL(Val); // Determine OutCC and handle FP special case. if (isInteger) { OutCC = changeIntCCToAArch64CC(CC); } else { assert(LHS.getValueType().isFloatingPoint()); AArch64CC::CondCode ExtraCC; changeFPCCToAArch64CC(CC, OutCC, ExtraCC); // Surpisingly some floating point conditions can't be tested with a // single condition code. Construct an additional comparison in this case. // See comment below on how we deal with OR conditions. if (ExtraCC != AArch64CC::AL) { SDValue ExtraCmp; if (!CCOp.getNode()) ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); else { SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); // Note that we want the inverse of ExtraCC, so NZCV is not inversed. unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC); ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL, DAG); } CCOp = ExtraCmp; Predicate = AArch64CC::getInvertedCondCode(ExtraCC); OutCC = AArch64CC::getInvertedCondCode(OutCC); } } // Produce a normal comparison if we are first in the chain if (!CCOp.getNode()) return emitComparison(LHS, RHS, CC, DL, DAG); // Otherwise produce a ccmp. SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL, DAG); } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse()) return SDValue(); assert((Opcode == ISD::OR || !PushNegate) && "Can only push negate through OR operation"); // Check if both sides can be transformed. SDValue LHS = Val->getOperand(0); SDValue RHS = Val->getOperand(1); bool CanPushNegateL; if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1)) return SDValue(); bool CanPushNegateR; if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1)) return SDValue(); // Do we need to negate our operands? bool NegateOperands = Opcode == ISD::OR; // We can negate the results of all previous operations by inverting the // predicate flags giving us a free negation for one side. For the other side // we need to be able to push the negation to the leafs of the tree. if (NegateOperands) { if (!CanPushNegateL && !CanPushNegateR) return SDValue(); // Order the side where we can push the negate through to LHS. if (!CanPushNegateL && CanPushNegateR) std::swap(LHS, RHS); } else { bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; bool NeedsNegOutR = RHS->getOpcode() == ISD::OR; if (NeedsNegOutL && NeedsNegOutR) return SDValue(); // Order the side where we need to negate the output flags to RHS so it // gets emitted first. if (NeedsNegOutL) std::swap(LHS, RHS); } // Emit RHS. If we want to negate the tree we only need to push a negate // through if we are already in a PushNegate case, otherwise we can negate // the "flags to test" afterwards. AArch64CC::CondCode RHSCC; SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate, CCOp, Predicate, Depth+1); if (NegateOperands && !PushNegate) RHSCC = AArch64CC::getInvertedCondCode(RHSCC); // Emit LHS. We must push the negate through if we need to negate it. SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands, CmpR, RHSCC, Depth+1); // If we transformed an OR to and AND then we have to negate the result // (or absorb a PushNegate resulting in a double negation). if (Opcode == ISD::OR && !PushNegate) OutCC = AArch64CC::getInvertedCondCode(OutCC); return CmpL; } /// @} static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); if (!isLegalArithImmed(C)) { // Constant does not fit, try adjusting it by one? switch (CC) { default: break; case ISD::SETLT: case ISD::SETGE: if ((VT == MVT::i32 && C != 0x80000000 && isLegalArithImmed((uint32_t)(C - 1))) || (VT == MVT::i64 && C != 0x80000000ULL && isLegalArithImmed(C - 1ULL))) { CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETULT: case ISD::SETUGE: if ((VT == MVT::i32 && C != 0 && isLegalArithImmed((uint32_t)(C - 1))) || (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETLE: case ISD::SETGT: if ((VT == MVT::i32 && C != INT32_MAX && isLegalArithImmed((uint32_t)(C + 1))) || (VT == MVT::i64 && C != INT64_MAX && isLegalArithImmed(C + 1ULL))) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETULE: case ISD::SETUGT: if ((VT == MVT::i32 && C != UINT32_MAX && isLegalArithImmed((uint32_t)(C + 1))) || (VT == MVT::i64 && C != UINT64_MAX && isLegalArithImmed(C + 1ULL))) { CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; RHS = DAG.getConstant(C, dl, VT); } break; } } } SDValue Cmp; AArch64CC::CondCode AArch64CC; if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa(RHS)) { const ConstantSDNode *RHSC = cast(RHS); // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. // For the i8 operand, the largest immediate is 255, so this can be easily // encoded in the compare instruction. For the i16 operand, however, the // largest immediate cannot be encoded in the compare. // Therefore, use a sign extending load and cmn to avoid materializing the // -1 constant. For example, // movz w1, #65535 // ldrh w0, [x0, #0] // cmp w0, w1 // > // ldrsh w0, [x0, #0] // cmn w0, #1 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) // if and only if (sext LHS) == (sext RHS). The checks are in place to // ensure both the LHS and RHS are truly zero extended and to make sure the // transformation is profitable. if ((RHSC->getZExtValue() >> 16 == 0) && isa(LHS) && cast(LHS)->getExtensionType() == ISD::ZEXTLOAD && cast(LHS)->getMemoryVT() == MVT::i16 && LHS.getNode()->hasNUsesOfValue(1, 0)) { int16_t ValueofRHS = cast(RHS)->getZExtValue(); if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, DAG.getValueType(MVT::i16)); Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, RHS.getValueType()), CC, dl, DAG); AArch64CC = changeIntCCToAArch64CC(CC); } } if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) { if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); } } } if (!Cmp) { Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC = changeIntCCToAArch64CC(CC); } AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); return Cmp; } static std::pair getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && "Unsupported value type"); SDValue Value, Overflow; SDLoc DL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); unsigned Opc = 0; switch (Op.getOpcode()) { default: llvm_unreachable("Unknown overflow instruction!"); case ISD::SADDO: Opc = AArch64ISD::ADDS; CC = AArch64CC::VS; break; case ISD::UADDO: Opc = AArch64ISD::ADDS; CC = AArch64CC::HS; break; case ISD::SSUBO: Opc = AArch64ISD::SUBS; CC = AArch64CC::VS; break; case ISD::USUBO: Opc = AArch64ISD::SUBS; CC = AArch64CC::LO; break; // Multiply needs a little bit extra work. case ISD::SMULO: case ISD::UMULO: { CC = AArch64CC::NE; bool IsSigned = Op.getOpcode() == ISD::SMULO; if (Op.getValueType() == MVT::i32) { unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; // For a 32 bit multiply with overflow check we want the instruction // selector to generate a widening multiply (SMADDL/UMADDL). For that we // need to generate the following pattern: // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, DAG.getConstant(0, DL, MVT::i64)); // On AArch64 the upper 32 bits are always zero extended for a 32 bit // operation. We need to clear out the upper 32 bits, because we used a // widening multiply that wrote all 64 bits. In the end this should be a // noop. Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); if (IsSigned) { // The signed overflow check requires more than just a simple check for // any bit set in the upper 32 bits of the result. These bits could be // just the sign bits of a negative number. To perform the overflow // check we have to arithmetic shift right the 32nd bit of the result by // 31 bits. Then we compare the result to the upper 32 bits. SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, DAG.getConstant(32, DL, MVT::i64)); UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, DAG.getConstant(31, DL, MVT::i64)); // It is important that LowerBits is last, otherwise the arithmetic // shift will not be folded into the compare (SUBS). SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) .getValue(1); } else { // The overflow check for unsigned multiply is easy. We only need to // check if any of the upper 32 bits are set. This can be done with a // CMP (shifted register). For that we need to generate the following // pattern: // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, DAG.getConstant(32, DL, MVT::i64)); SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, DL, MVT::i64), UpperBits).getValue(1); } break; } assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); // For the 64 bit multiply Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); if (IsSigned) { SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, DAG.getConstant(63, DL, MVT::i64)); // It is important that LowerBits is last, otherwise the arithmetic // shift will not be folded into the compare (SUBS). SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) .getValue(1); } else { SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, DL, MVT::i64), UpperBits).getValue(1); } break; } } // switch (...) if (Opc) { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); // Emit the AArch64 operation with overflow check. Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); Overflow = Value.getValue(1); } return std::make_pair(Value, Overflow); } SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { SmallVector Ops(Op->op_begin(), Op->op_end()); return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; } static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { SDValue Sel = Op.getOperand(0); SDValue Other = Op.getOperand(1); // If neither operand is a SELECT_CC, give up. if (Sel.getOpcode() != ISD::SELECT_CC) std::swap(Sel, Other); if (Sel.getOpcode() != ISD::SELECT_CC) return Op; // The folding we want to perform is: // (xor x, (select_cc a, b, cc, 0, -1) ) // --> // (csel x, (xor x, -1), cc ...) // // The latter will get matched to a CSINV instruction. ISD::CondCode CC = cast(Sel.getOperand(4))->get(); SDValue LHS = Sel.getOperand(0); SDValue RHS = Sel.getOperand(1); SDValue TVal = Sel.getOperand(2); SDValue FVal = Sel.getOperand(3); SDLoc dl(Sel); // FIXME: This could be generalized to non-integer comparisons. if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return Op; ConstantSDNode *CFVal = dyn_cast(FVal); ConstantSDNode *CTVal = dyn_cast(TVal); // The values aren't constants, this isn't the pattern we're looking for. if (!CFVal || !CTVal) return Op; // We can commute the SELECT_CC by inverting the condition. This // might be needed to make this fit into a CSINV pattern. if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } // If the constants line up, perform the transform! if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); FVal = Other; TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, DAG.getConstant(-1ULL, dl, Other.getValueType())); return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, CCVal, Cmp); } return Op; } static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); SDVTList VTs = DAG.getVTList(VT, MVT::i32); unsigned Opc; bool ExtraOp = false; switch (Op.getOpcode()) { default: llvm_unreachable("Invalid code"); case ISD::ADDC: Opc = AArch64ISD::ADDS; break; case ISD::SUBC: Opc = AArch64ISD::SUBS; break; case ISD::ADDE: Opc = AArch64ISD::ADCS; ExtraOp = true; break; case ISD::SUBE: Opc = AArch64ISD::SBCS; ExtraOp = true; break; } if (!ExtraOp) return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) return SDValue(); SDLoc dl(Op); AArch64CC::CondCode CC; // The actual operation that sets the overflow or carry flag. SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); // We use 0 and 1 as false and true values. SDValue TVal = DAG.getConstant(1, dl, MVT::i32); SDValue FVal = DAG.getConstant(0, dl, MVT::i32); // We use an inverted condition, because the conditional select is inverted // too. This will allow it to be selected to a single instruction: // CSINC Wd, WZR, WZR, invert(cond). SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, CCVal, Overflow); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } // Prefetch operands are: // 1: Address to prefetch // 2: bool isWrite // 3: int locality (0 = no locality ... 3 = extreme locality) // 4: bool isDataCache static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); unsigned IsWrite = cast(Op.getOperand(2))->getZExtValue(); unsigned Locality = cast(Op.getOperand(3))->getZExtValue(); unsigned IsData = cast(Op.getOperand(4))->getZExtValue(); bool IsStream = !Locality; // When the locality number is set if (Locality) { // The front-end should have filtered out the out-of-range values assert(Locality <= 3 && "Prefetch locality out-of-range"); // The locality degree is the opposite of the cache speed. // Put the number the other way around. // The encoding starts at 0 for level 1 Locality = 3 - Locality; } // built the mask value encoding the expected behavior. unsigned PrfOp = (IsWrite << 4) | // Load/Store bit (!IsData << 3) | // IsDataCache bit (Locality << 1) | // Cache level bits (unsigned)IsStream; // Stream bit return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1)); } SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); RTLIB::Libcall LC; LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); return LowerF128Call(Op, DAG, LC); } SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { if (Op.getOperand(0).getValueType() != MVT::f128) { // It's legal except when f128 is involved return Op; } RTLIB::Libcall LC; LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); // FP_ROUND node has a second operand indicating whether it is known to be // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. SDValue SrcVal = Op.getOperand(0); return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, SDLoc(Op)).first; } static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. EVT InVT = Op.getOperand(0).getValueType(); EVT VT = Op.getValueType(); unsigned NumElts = InVT.getVectorNumElements(); // f16 vectors are promoted to f32 before a conversion. if (InVT.getVectorElementType() == MVT::f16) { MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); SDLoc dl(Op); return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); } if (VT.getSizeInBits() < InVT.getSizeInBits()) { SDLoc dl(Op); SDValue Cv = DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), Op.getOperand(0)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); } if (VT.getSizeInBits() > InVT.getSizeInBits()) { SDLoc dl(Op); MVT ExtVT = MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), VT.getVectorNumElements()); SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); return DAG.getNode(Op.getOpcode(), dl, VT, Ext); } // Type changing conversions are illegal. return Op; } SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { if (Op.getOperand(0).getValueType().isVector()) return LowerVectorFP_TO_INT(Op, DAG); // f16 conversions are promoted to f32. if (Op.getOperand(0).getValueType() == MVT::f16) { SDLoc dl(Op); return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0))); } if (Op.getOperand(0).getValueType() != MVT::f128) { // It's legal except when f128 is involved return Op; } RTLIB::Libcall LC; if (Op.getOpcode() == ISD::FP_TO_SINT) LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); SmallVector Ops(Op->op_begin(), Op->op_end()); return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. EVT VT = Op.getValueType(); SDLoc dl(Op); SDValue In = Op.getOperand(0); EVT InVT = In.getValueType(); if (VT.getSizeInBits() < InVT.getSizeInBits()) { MVT CastVT = MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), InVT.getVectorNumElements()); In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); } if (VT.getSizeInBits() > InVT.getSizeInBits()) { unsigned CastOpc = Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; EVT CastVT = VT.changeVectorElementTypeToInteger(); In = DAG.getNode(CastOpc, dl, CastVT, In); return DAG.getNode(Op.getOpcode(), dl, VT, In); } return Op; } SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVectorINT_TO_FP(Op, DAG); // f16 conversions are promoted to f32. if (Op.getValueType() == MVT::f16) { SDLoc dl(Op); return DAG.getNode( ISD::FP_ROUND, dl, MVT::f16, DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)), DAG.getIntPtrConstant(0, dl)); } // i128 conversions are libcalls. if (Op.getOperand(0).getValueType() == MVT::i128) return SDValue(); // Other conversions are legal, unless it's to the completely software-based // fp128. if (Op.getValueType() != MVT::f128) return Op; RTLIB::Libcall LC; if (Op.getOpcode() == ISD::SINT_TO_FP) LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); else LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); return LowerF128Call(Op, DAG, LC); } SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { // For iOS, we want to call an alternative entry point: __sincos_stret, // which returns the values in two S / D registers. SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); ArgListTy Args; ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.isSExt = false; Entry.isZExt = false; Args.push_back(Entry); const char *LibcallName = (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); std::pair CallResult = LowerCallTo(CLI); return CallResult.first; } static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { if (Op.getValueType() != MVT::f16) return SDValue(); assert(Op.getOperand(0).getValueType() == MVT::i16); SDLoc DL(Op); Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); return SDValue( DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 0); } static EVT getExtensionTo64Bits(const EVT &OrigVT) { if (OrigVT.getSizeInBits() >= 64) return OrigVT; assert(OrigVT.isSimple() && "Expecting a simple value type"); MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; switch (OrigSimpleTy) { default: llvm_unreachable("Unexpected Vector Type"); case MVT::v2i8: case MVT::v2i16: return MVT::v2i32; case MVT::v4i8: return MVT::v4i16; } } static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode) { // The vector originally had a size of OrigTy. It was then extended to ExtTy. // We expect the ExtTy to be 128-bits total. If the OrigTy is less than // 64-bits we need to insert a new extension so that it will be 64-bits. assert(ExtTy.is128BitVector() && "Unexpected extension size"); if (OrigTy.getSizeInBits() >= 64) return N; // Must extend size to at least 64 bits to be used as an operand for VMULL. EVT NewVT = getExtensionTo64Bits(OrigTy); return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); } static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, bool isSigned) { EVT VT = N->getValueType(0); if (N->getOpcode() != ISD::BUILD_VECTOR) return false; for (const SDValue &Elt : N->op_values()) { if (ConstantSDNode *C = dyn_cast(Elt)) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); unsigned HalfSize = EltSize / 2; if (isSigned) { if (!isIntN(HalfSize, C->getSExtValue())) return false; } else { if (!isUIntN(HalfSize, C->getZExtValue())) return false; } continue; } return false; } return true; } static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, N->getOperand(0)->getValueType(0), N->getValueType(0), N->getOpcode()); assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); EVT VT = N->getValueType(0); SDLoc dl(N); unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; unsigned NumElts = VT.getVectorNumElements(); MVT TruncVT = MVT::getIntegerVT(EltSize); SmallVector Ops; for (unsigned i = 0; i != NumElts; ++i) { ConstantSDNode *C = cast(N->getOperand(i)); const APInt &CInt = C->getAPIntValue(); // Element types smaller than 32 bits are not legal, so use i32 elements. // The values are implicitly truncated so sext vs. zext doesn't matter. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); } return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::getVectorVT(TruncVT, NumElts), Ops); } static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::SIGN_EXTEND) return true; if (isExtendedBUILD_VECTOR(N, DAG, true)) return true; return false; } static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::ZERO_EXTEND) return true; if (isExtendedBUILD_VECTOR(N, DAG, false)) return true; return false; } static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isSignExtended(N0, DAG) && isSignExtended(N1, DAG); } return false; } static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); } return false; } static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { // Multiplications are only custom-lowered for 128-bit vectors so that // VMULL can be detected. Otherwise v2i64 multiplications are not legal. EVT VT = Op.getValueType(); assert(VT.is128BitVector() && VT.isInteger() && "unexpected type for custom-lowering ISD::MUL"); SDNode *N0 = Op.getOperand(0).getNode(); SDNode *N1 = Op.getOperand(1).getNode(); unsigned NewOpc = 0; bool isMLA = false; bool isN0SExt = isSignExtended(N0, DAG); bool isN1SExt = isSignExtended(N1, DAG); if (isN0SExt && isN1SExt) NewOpc = AArch64ISD::SMULL; else { bool isN0ZExt = isZeroExtended(N0, DAG); bool isN1ZExt = isZeroExtended(N1, DAG); if (isN0ZExt && isN1ZExt) NewOpc = AArch64ISD::UMULL; else if (isN1SExt || isN1ZExt) { // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these // into (s/zext A * s/zext C) + (s/zext B * s/zext C) if (isN1SExt && isAddSubSExt(N0, DAG)) { NewOpc = AArch64ISD::SMULL; isMLA = true; } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { NewOpc = AArch64ISD::UMULL; isMLA = true; } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { std::swap(N0, N1); NewOpc = AArch64ISD::UMULL; isMLA = true; } } if (!NewOpc) { if (VT == MVT::v2i64) // Fall through to expand this. It is not legal. return SDValue(); else // Other vector multiplications are legal. return Op; } } // Legalize to a S/UMULL instruction SDLoc DL(Op); SDValue Op0; SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); if (!isMLA) { Op0 = skipExtensionForVectorMULL(N0, DAG); assert(Op0.getValueType().is64BitVector() && Op1.getValueType().is64BitVector() && "unexpected types for extended operands to VMULL"); return DAG.getNode(NewOpc, DL, VT, Op0, Op1); } // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during // isel lowering to take advantage of no-stall back to back s/umul + s/umla. // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); EVT Op1VT = Op1.getValueType(); return DAG.getNode(N0->getOpcode(), DL, VT, DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::aarch64_thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); } case Intrinsic::aarch64_neon_smax: return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_neon_umax: return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_neon_smin: return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_neon_umin: return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } } SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("unimplemented operand"); return SDValue(); case ISD::BITCAST: return LowerBITCAST(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::ADDC: case ISD::ADDE: case ISD::SUBC: case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::FADD: return LowerF128Call(Op, DAG, RTLIB::ADD_F128); case ISD::FSUB: return LowerF128Call(Op, DAG, RTLIB::SUB_F128); case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128); case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128); case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerVectorSRA_SRL_SHL(Op, DAG); case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); case ISD::CTPOP: return LowerCTPOP(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::AND: return LowerVectorAND(Op, DAG); case ISD::OR: return LowerVectorOR(Op, DAG); case ISD::XOR: return LowerXOR(Op, DAG); case ISD::PREFETCH: return LowerPREFETCH(Op, DAG); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); } } //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// #include "AArch64GenCallingConv.inc" /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const { switch (CC) { default: llvm_unreachable("Unsupported calling convention."); case CallingConv::WebKit_JS: return CC_AArch64_WebKit_JS; case CallingConv::GHC: return CC_AArch64_GHC; case CallingConv::C: case CallingConv::Fast: if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; } } SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); // At this point, Ins[].VT may already be promoted to i32. To correctly // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here // we use a special version of AnalyzeFormalArguments to pass in ValVT and // LocVT. unsigned NumArgs = Ins.size(); Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); unsigned CurArgIdx = 0; for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Ins[i].VT; if (Ins[i].isOrigArg()) { std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); CurArgIdx = Ins[i].getOrigArgIndex(); // Get type of the original argument. EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) ValVT = MVT::i8; else if (ActualMVT == MVT::i16) ValVT = MVT::i16; } CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } assert(ArgLocs.size() == Ins.size()); SmallVector ArgValues; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (Ins[i].Flags.isByVal()) { // Byval is used for HFAs in the PCS, but the system should work in a // non-compliant manner for larger structs. EVT PtrVT = getPointerTy(DAG.getDataLayout()); int Size = Ins[i].Flags.getByValSize(); unsigned NumRegs = (Size + 7) / 8; // FIXME: This works on big-endian for composite byvals, which are the common // case. It should also work for fundamental types too. unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); InVals.push_back(FrameIdxN); continue; } if (VA.isRegLoc()) { // Arguments stored in registers. EVT RegVT = VA.getLocVT(); SDValue ArgValue; const TargetRegisterClass *RC; if (RegVT == MVT::i32) RC = &AArch64::GPR32RegClass; else if (RegVT == MVT::i64) RC = &AArch64::GPR64RegClass; else if (RegVT == MVT::f16) RC = &AArch64::FPR16RegClass; else if (RegVT == MVT::f32) RC = &AArch64::FPR32RegClass; else if (RegVT == MVT::f64 || RegVT.is64BitVector()) RC = &AArch64::FPR64RegClass; else if (RegVT == MVT::f128 || RegVT.is128BitVector()) RC = &AArch64::FPR128RegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); // If this is an 8, 16 or 32-bit value, it is really passed promoted // to 64 bits. Insert an assert[sz]ext to capture this, then // truncate to the right size. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); break; case CCValAssign::AExt: case CCValAssign::SExt: case CCValAssign::ZExt: // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt // nodes after our lowering. assert(RegVT == Ins[i].VT && "incorrect register location selected"); break; } InVals.push_back(ArgValue); } else { // VA.isRegLoc() assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); unsigned ArgOffset = VA.getLocMemOffset(); unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; uint32_t BEAlign = 0; if (!Subtarget->isLittleEndian() && ArgSize < 8 && !Ins[i].Flags.isInConsecutiveRegs()) BEAlign = 8 - ArgSize; int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue ArgValue; // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; MVT MemVT = VA.getValVT(); switch (VA.getLocInfo()) { default: break; case CCValAssign::BCvt: MemVT = VA.getLocVT(); break; case CCValAssign::SExt: ExtType = ISD::SEXTLOAD; break; case CCValAssign::ZExt: ExtType = ISD::ZEXTLOAD; break; case CCValAssign::AExt: ExtType = ISD::EXTLOAD; break; } ArgValue = DAG.getExtLoad( ExtType, DL, VA.getLocVT(), Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT, false, false, false, 0); InVals.push_back(ArgValue); } } // varargs if (isVarArg) { if (!Subtarget->isTargetDarwin()) { // The AAPCS variadic function ABI is identical to the non-variadic // one. As a result there may be more arguments in registers and we should // save them for future reference. saveVarArgRegisters(CCInfo, DAG, DL, Chain); } AArch64FunctionInfo *AFI = MF.getInfo(); // This will point to the next argument passed via stack. unsigned StackOffset = CCInfo.getNextStackOffset(); // We currently pass all varargs at 8-byte alignment. StackOffset = ((StackOffset + 7) & ~7); AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); } AArch64FunctionInfo *FuncInfo = MF.getInfo(); unsigned StackArgSize = CCInfo.getNextStackOffset(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { // This is a non-standard ABI so by fiat I say we're allowed to make full // use of the stack area to be popped, which must be aligned to 16 bytes in // any case: StackArgSize = RoundUpToAlignment(StackArgSize, 16); // If we're expected to restore the stack (e.g. fastcc) then we'll be adding // a multiple of 16. FuncInfo->setArgumentStackToRestore(StackArgSize); // This realignment carries over to the available bytes below. Our own // callers will guarantee the space is free by giving an aligned value to // CALLSEQ_START. } // Even if we're not expected to free up the space, it's useful to know how // much is there while considering tail calls (because we can reuse it). FuncInfo->setBytesInStackArgArea(StackArgSize); return Chain; } void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, SDValue &Chain) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); auto PtrVT = getPointerTy(DAG.getDataLayout()); SmallVector MemOps; static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7 }; static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; if (GPRSaveSize != 0) { GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); } } FuncInfo->setVarArgsGPRIndex(GPRIdx); FuncInfo->setVarArgsGPRSize(GPRSaveSize); if (Subtarget->hasFPARMv8()) { static const MCPhysReg FPRArgRegs[] = { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); int FPRIdx = 0; if (FPRSaveSize != 0) { FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(16, DL, PtrVT)); } } FuncInfo->setVarArgsFPRIndex(FPRIdx); FuncInfo->setVarArgsFPRSize(FPRSaveSize); } if (!MemOps.empty()) { Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } } /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. SDValue AArch64TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; // Assign locations to each value returned by this call. SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign VA = RVLocs[i]; // Pass 'this' value directly from the argument to return value, to avoid // reg unit interference if (i == 0 && isThisReturn) { assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && "unexpected return calling convention register assignment"); InVals.push_back(ThisVal); continue; } SDValue Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); Chain = Val.getValue(1); InFlag = Val.getValue(2); switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; } InVals.push_back(Val); } return Chain; } bool AArch64TargetLowering::isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const { // For CallingConv::C this function knows whether the ABI needs // changing. That's not true for other conventions so they will have to opt in // manually. if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) return false; const MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; // Byval parameters hand the function a pointer directly into the stack area // we want to reuse during a tail call. Working around this *is* possible (see // X86) but less efficient and uglier in LowerCall. for (Function::const_arg_iterator i = CallerF->arg_begin(), e = CallerF->arg_end(); i != e; ++i) if (i->hasByValAttr()) return false; if (getTargetMachine().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) return true; return false; } // Externally-defined functions with weak linkage should not be // tail-called on AArch64 when the OS does not support dynamic // pre-emption of symbols, as the AAELF spec requires normal calls // to undefined weak functions to be replaced with a NOP or jump to the // next instruction. The behaviour of branch instructions in this // situation (as used for tail calls) is implementation-defined, so we // cannot rely on the linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); const Triple &TT = getTargetMachine().getTargetTriple(); if (GV->hasExternalWeakLinkage() && (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; } // Now we search for cases where we can use a tail call without changing the // ABI. Sibcall is used in some places (particularly gcc) to refer to this // concept. // I want anyone implementing a new calling convention to think long and hard // about this assert. assert((!isVarArg || CalleeCC == CallingConv::C) && "Unexpected variadic calling convention"); if (isVarArg && !Outs.empty()) { // At least two cases here: if caller is fastcc then we can't have any // memory arguments (we'd be expected to clean up the stack afterwards). If // caller is C then we could potentially use its argument area. // FIXME: for now we take the most conservative of these in both cases: // disallow all variadic memory operands. SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); for (const CCValAssign &ArgLoc : ArgLocs) if (!ArgLoc.isRegLoc()) return false; } // If the calling conventions do not match, then we'd better make sure the // results are returned in the same way as what the caller expects. if (!CCMatch) { SmallVector RVLocs1; CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); SmallVector RVLocs2; CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); if (RVLocs1.size() != RVLocs2.size()) return false; for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) return false; if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) return false; if (RVLocs1[i].isRegLoc()) { if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) return false; } else { if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) return false; } } } // Nothing more to check if the callee is taking no arguments if (Outs.empty()) return true; SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); const AArch64FunctionInfo *FuncInfo = MF.getInfo(); // If the stack arguments for this call would fit into our own save area then // the call can be made tail. return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); } SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo *MFI, int ClobberedFI) const { SmallVector ArgChains; int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; // Include the original chain at the beginning of the list. When this is // used by target LowerCall hooks, this helps legalize find the // CALLSEQ_BEGIN node. ArgChains.push_back(Chain); // Add a chain value for each stack argument corresponding for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U) if (LoadSDNode *L = dyn_cast(*U)) if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) if (FI->getIndex() < 0) { int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); int64_t InLastByte = InFirstByte; InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || (FirstByte <= InFirstByte && InFirstByte <= LastByte)) ArgChains.push_back(SDValue(L, 1)); } // Build a tokenfactor for all the chains. return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); } bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const { return CallCC == CallingConv::Fast && TailCallOpt; } bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { return CallCC == CallingConv::Fast; } /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, /// and add input and output parameter nodes. SDValue AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &DL = CLI.DL; SmallVector &Outs = CLI.Outs; SmallVector &OutVals = CLI.OutVals; SmallVector &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &IsTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool IsThisReturn = false; AArch64FunctionInfo *FuncInfo = MF.getInfo(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; bool IsSibCall = false; if (IsTailCall) { // Check if it's really possible to do a tail call. IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // A sibling call is one where we're under the usual C ABI and not planning // to change that but can still do a tail call: if (!TailCallOpt && IsTailCall) IsSibCall = true; if (IsTailCall) ++NumTailCalls; } // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); if (IsVarArg) { // Handle fixed and variable vector arguments differently. // Variable vector arguments always go into memory. unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/ !Outs[i].IsFixed); bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } } else { // At this point, Outs[].VT may already be promoted to i32. To correctly // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here // we use a special version of AnalyzeCallOperands to pass in ValVT and // LocVT. unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Outs[i].VT; // Get type of the original argument. EVT ActualVT = getValueType(DAG.getDataLayout(), CLI.getArgs()[Outs[i].OrigArgIndex].Ty, /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) ValVT = MVT::i8; else if (ActualMVT == MVT::i16) ValVT = MVT::i16; CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } } // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); if (IsSibCall) { // Since we're not changing the ABI to make this a tail call, the memory // operands are already available in the caller's incoming argument space. NumBytes = 0; } // FPDiff is the byte offset of the call's argument area from the callee's. // Stores to callee stack arguments will be placed in FixedStackSlots offset // by this amount for a tail call. In a sibling call it must be 0 because the // caller will deallocate the entire stack and the callee still expects its // arguments to begin at SP+0. Completely unused for non-tail calls. int FPDiff = 0; if (IsTailCall && !IsSibCall) { unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); // Since callee will pop argument stack as a tail call, we must keep the // popped size 16-byte aligned. NumBytes = RoundUpToAlignment(NumBytes, 16); // FPDiff will be negative if this tail call requires more space than we // would automatically have in our incoming argument space. Positive if we // can actually shrink the stack. FPDiff = NumReusableBytes - NumBytes; // The stack pointer must be 16-byte aligned at all times it's used for a // memory operation, which in practice means at *all* times and in // particular across call boundaries. Therefore our own arguments started at // a 16-byte aligned SP and the delta applied for the tail call should // satisfy the same constraint. assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); } // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy(DAG.getDataLayout())); SmallVector, 8> RegsToPass; SmallVector MemOpChains; auto PtrVT = getPointerTy(DAG.getDataLayout()); // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::AExt: if (Outs[realArgIdx].ArgVT == MVT::i1) { // AAPCS requires i1 to be zero-extended to 8-bits by the caller. Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); } Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; case CCValAssign::FPExt: Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); break; } if (VA.isRegLoc()) { if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { assert(VA.getLocVT() == MVT::i64 && "unexpected calling convention register assignment"); assert(!Ins.empty() && Ins[0].VT == MVT::i64 && "unexpected use of 'returned'"); IsThisReturn = true; } RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { assert(VA.isMemLoc()); SDValue DstAddr; MachinePointerInfo DstInfo; // FIXME: This works on big-endian for composite byvals, which are the // common case. It should also work for fundamental types too. uint32_t BEAlign = 0; unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 : VA.getValVT().getSizeInBits(); OpSize = (OpSize + 7) / 8; if (!Subtarget->isLittleEndian() && !Flags.isByVal() && !Flags.isInConsecutiveRegs()) { if (OpSize < 8) BEAlign = 8 - OpSize; } unsigned LocMemOffset = VA.getLocMemOffset(); int32_t Offset = LocMemOffset + BEAlign; SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); if (IsTailCall) { Offset = Offset + FPDiff; int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); DstAddr = DAG.getFrameIndex(FI, PtrVT); DstInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Make sure any stack arguments overlapping with where we're storing // are loaded before this eventual operation. Otherwise they'll be // clobbered. Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); } else { SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset); } if (Outs[i].Flags.isByVal()) { SDValue SizeNode = DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); SDValue Cpy = DAG.getMemcpy( Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), /*isVol = */ false, /*AlwaysInline = */ false, /*isTailCall = */ false, DstInfo, MachinePointerInfo()); MemOpChains.push_back(Cpy); } else { // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already // promoted to a legal register type i32, we should truncate Arg back to // i1/i8/i16. if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || VA.getValVT() == MVT::i16) Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); MemOpChains.push_back(Store); } } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; for (auto &RegToPass : RegsToPass) { Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, RegToPass.second, InFlag); InFlag = Chain.getValue(1); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. if (getTargetMachine().getCodeModel() == CodeModel::Large && Subtarget->isTargetMachO()) { if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); bool InternalLinkage = GV->hasInternalLinkage(); if (InternalLinkage) Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); else { Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { const char *Sym = S->getSymbol(); Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { const char *Sym = S->getSymbol(); Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); } // We don't usually want to end the call-sequence here because we would tidy // the frame up *after* the call, however in the ABI-changing tail-call case // we've carefully laid out the parameters so that when sp is reset they'll be // in the correct location. if (IsTailCall && !IsSibCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), DAG.getIntPtrConstant(0, DL, true), InFlag, DL); InFlag = Chain.getValue(1); } std::vector Ops; Ops.push_back(Chain); Ops.push_back(Callee); if (IsTailCall) { // Each tail call may have to adjust the stack by a different amount, so // this information must travel along with the operation for eventual // consumption by emitEpilogue. Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); } // Add argument registers to the end of the list so that they are known live // into the call. for (auto &RegToPass : RegsToPass) Ops.push_back(DAG.getRegister(RegToPass.first, RegToPass.second.getValueType())); // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); if (IsThisReturn) { // For 'this' returns, use the X0-preserving mask if applicable Mask = TRI->getThisReturnPreservedMask(MF, CallConv); if (!Mask) { IsThisReturn = false; Mask = TRI->getCallPreservedMask(MF, CallConv); } } else Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); if (InFlag.getNode()) Ops.push_back(InFlag); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); // If we're doing a tall call, use a TC_RETURN here rather than an // actual call instruction. if (IsTailCall) { MF.getFrameInfo()->setHasTailCall(); return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); } // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); InFlag = Chain.getValue(1); uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? RoundUpToAlignment(NumBytes, 16) : 0; Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), DAG.getIntPtrConstant(CalleePopBytes, DL, true), InFlag, DL); if (!Ins.empty()) InFlag = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we // return. return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, InVals, IsThisReturn, IsThisReturn ? OutVals[0] : SDValue()); } bool AArch64TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC); } SDValue AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, SDLoc DL, SelectionDAG &DAG) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC); // Copy the result values into the output registers. SDValue Flag; SmallVector RetOps(1, Chain); for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue Arg = OutVals[realRVLocIdx]; switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: if (Outs[i].ArgVT == MVT::i1) { // AAPCS requires i1 to be zero-extended to i8 by the producer of the // value. This is strictly redundant on Darwin (which uses "zeroext // i1"), but will be optimised out before ISel. Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); } break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; } Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (AArch64::GPR64RegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i64)); else if (AArch64::FPR64RegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); } //===----------------------------------------------------------------------===// // Other Lowering Code //===----------------------------------------------------------------------===// SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); const GlobalAddressSDNode *GN = cast(Op); const GlobalValue *GV = GN->getGlobal(); unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); assert(cast(Op)->getOffset() == 0 && "unexpected offset in global node"); // This also catched the large code model case for Darwin. if ((OpFlags & AArch64II::MO_GOT) != 0) { SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes instead of using a wrapper node. return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); } if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) { assert(getTargetMachine().getCodeModel() == CodeModel::Small && "use of MO_CONSTPOOL only supported on small model"); SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); SDValue GlobalAddr = DAG.getLoad( PtrVT, DL, DAG.getEntryNode(), PoolAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /*isVolatile=*/false, /*isNonTemporal=*/true, /*isInvariant=*/true, 8); if (GN->getOffset() != 0) return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, DAG.getConstant(GN->getOffset(), DL, PtrVT)); return GlobalAddr; } if (getTargetMachine().getCodeModel() == CodeModel::Large) { const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( AArch64ISD::WrapperLarge, DL, PtrVT, DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); } else { // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and // the only correct model on Darwin. SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags | AArch64II::MO_PAGE); unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); } } /// \brief Convert a TLS address reference into the correct sequence of loads /// and calls to compute the variable's address (for Darwin, currently) and /// return an SDValue containing the final node. /// Darwin only has one TLS scheme which must be capable of dealing with the /// fully general situation, in the worst case. This means: /// + "extern __thread" declaration. /// + Defined in a possibly unknown dynamic library. /// /// The general system is that each __thread variable has a [3 x i64] descriptor /// which contains information used by the runtime to calculate the address. The /// only part of this the compiler needs to know about is the first xword, which /// contains a function pointer that must be called with the address of the /// entire descriptor in "x0". /// /// Since this descriptor may be in a different unit, in general even the /// descriptor must be accessed via an indirect load. The "ideal" code sequence /// is: /// adrp x0, _var@TLVPPAGE /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, /// ; the function pointer /// blr x1 ; Uses descriptor address in x0 /// ; Address of _var is now in x0. /// /// If the address of _var's descriptor *is* known to the linker, then it can /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for /// a slight efficiency gain. SDValue AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); SDLoc DL(Op); MVT PtrVT = getPointerTy(DAG.getDataLayout()); const GlobalValue *GV = cast(Op)->getGlobal(); SDValue TLVPAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); // The first entry in the descriptor is a function pointer that we must call // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, true, true, 8); Chain = FuncTLVGet.getValue(1); MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setAdjustsStack(true); // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). const uint32_t *Mask = Subtarget->getRegisterInfo()->getTLSCallPreservedMask(); // Finally, we can make the call. This is just a degenerate version of a // normal AArch64 call node: x0 takes the address of the descriptor, and // returns the address of the variable in this thread. Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); Chain = DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), DAG.getRegisterMask(Mask), Chain.getValue(1)); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); } /// When accessing thread-local variables under either the general-dynamic or /// local-dynamic system, we make a "TLS-descriptor" call. The variable will /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry /// is a function pointer to carry out the resolution. /// /// The sequence is: /// adrp x0, :tlsdesc:var /// ldr x1, [x0, #:tlsdesc_lo12:var] /// add x0, x0, #:tlsdesc_lo12:var /// .tlsdesccall var /// blr x1 /// (TPIDR_EL0 offset now in x0) /// /// The above sequence must be produced unscheduled, to enable the linker to /// optimize/relax this sequence. /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the /// above sequence, and expanded really late in the compilation flow, to ensure /// the sequence is produced as per above. SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SmallVector Ops; Ops.push_back(Chain); Ops.push_back(SymAddr); Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops); SDValue Glue = Chain.getValue(1); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); } SDValue AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "This function expects an ELF target"); assert(getTargetMachine().getCodeModel() == CodeModel::Small && "ELF TLS only supported in small memory model"); // Different choices can be made for the maximum size of the TLS area for a // module. For the small address model, the default TLS size is 16MiB and the // maximum TLS size is 4GiB. // FIXME: add -mtls-size command line option and make it control the 16MiB // vs. 4GiB code sequence generation. const GlobalAddressSDNode *GA = cast(Op); TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); if (DAG.getTarget().Options.EmulatedTLS) return LowerToTLSEmulatedModel(GA, DAG); if (!EnableAArch64ELFLocalDynamicTLSGeneration) { if (Model == TLSModel::LocalDynamic) Model = TLSModel::GeneralDynamic; } SDValue TPOff; EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); const GlobalValue *GV = GA->getGlobal(); SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); if (Model == TLSModel::LocalExec) { SDValue HiVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue LoVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); SDValue TPWithOff_lo = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, HiVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); SDValue TPWithOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo, LoVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); return TPWithOff; } else if (Model == TLSModel::InitialExec) { TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); } else if (Model == TLSModel::LocalDynamic) { // Local-dynamic accesses proceed in two phases. A general-dynamic TLS // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate // the beginning of the module's TLS region, followed by a DTPREL offset // calculation. // These accesses will need deduplicating if there's more than one. AArch64FunctionInfo *MFI = DAG.getMachineFunction().getInfo(); MFI->incNumLocalDynamicTLSAccesses(); // The call needs a relocation too for linker relaxation. It doesn't make // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of // the address. SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS); // Now we can calculate the offset from TPIDR_EL0 to this module's // thread-local area. TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); // Now use :dtprel_whatever: operations to calculate this variable's offset // in its thread-storage area. SDValue HiVar = DAG.getTargetGlobalAddress( GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue LoVar = DAG.getTargetGlobalAddress( GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); } else if (Model == TLSModel::GeneralDynamic) { // The call needs a relocation too for linker relaxation. It doesn't make // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of // the address. SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); // Finally we can make a call to calculate the offset from tpidr_el0. TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); } else llvm_unreachable("Unsupported ELF TLS access model"); return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); } SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->isTargetDarwin()) return LowerDarwinGlobalTLSAddress(Op, DAG); else if (Subtarget->isTargetELF()) return LowerELFGlobalTLSAddress(Op, DAG); llvm_unreachable("Unexpected platform trying to use TLS"); } SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); SDLoc dl(Op); // Handle f128 first, since lowering it will result in comparing the return // value of a libcall against zero, which is just what the rest of LowerBR_CC // is expecting to deal with. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!RHS.getNode()) { RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. unsigned Opc = LHS.getOpcode(); if (LHS.getResNo() == 1 && isOneConstant(RHS) && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unexpected condition code."); // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) return SDValue(); // The actual operation with overflow check. AArch64CC::CondCode OFCC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); if (CC == ISD::SETNE) OFCC = getInvertedCondCode(OFCC); SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Overflow); } if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); // If the RHS of the comparison is zero, we can potentially fold this // to a specialized branch. const ConstantSDNode *RHSC = dyn_cast(RHS); if (RHSC && RHSC->getZExtValue() == 0) { if (CC == ISD::SETEQ) { // See if we can use a TBZ to fold in an AND as well. // TBZ has a smaller branch displacement than CBZ. If the offset is // out of bounds, a late MI-layer pass rewrites branches. // 403.gcc is an example that hits this case. if (LHS.getOpcode() == ISD::AND && isa(LHS.getOperand(1)) && isPowerOf2_64(LHS.getConstantOperandVal(1))) { SDValue Test = LHS.getOperand(0); uint64_t Mask = LHS.getConstantOperandVal(1); return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, DAG.getConstant(Log2_64(Mask), dl, MVT::i64), Dest); } return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); } else if (CC == ISD::SETNE) { // See if we can use a TBZ to fold in an AND as well. // TBZ has a smaller branch displacement than CBZ. If the offset is // out of bounds, a late MI-layer pass rewrites branches. // 403.gcc is an example that hits this case. if (LHS.getOpcode() == ISD::AND && isa(LHS.getOperand(1)) && isPowerOf2_64(LHS.getConstantOperandVal(1))) { SDValue Test = LHS.getOperand(0); uint64_t Mask = LHS.getConstantOperandVal(1); return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, DAG.getConstant(Log2_64(Mask), dl, MVT::i64), Dest); } return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, DAG.getConstant(Mask, dl, MVT::i64), Dest); } } if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && LHS.getOpcode() != ISD::AND) { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, DAG.getConstant(Mask, dl, MVT::i64), Dest); } SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp); } assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue BR1 = DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); if (CC2 != AArch64CC::AL) { SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, Cmp); } return BR1; } SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue In1 = Op.getOperand(0); SDValue In2 = Op.getOperand(1); EVT SrcVT = In2.getValueType(); if (SrcVT.bitsLT(VT)) In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); else if (SrcVT.bitsGT(VT)) In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); EVT VecVT; EVT EltVT; uint64_t EltMask; SDValue VecVal1, VecVal2; if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { EltVT = MVT::i32; VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); EltMask = 0x80000000ULL; if (!VT.isVector()) { VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, DAG.getUNDEF(VecVT), In1); VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, DAG.getUNDEF(VecVT), In2); } else { VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); } } else if (VT == MVT::f64 || VT == MVT::v2f64) { EltVT = MVT::i64; VecVT = MVT::v2i64; // We want to materialize a mask with the high bit set, but the AdvSIMD // immediate moves cannot materialize that in a single instruction for // 64-bit elements. Instead, materialize zero and then negate it. EltMask = 0; if (!VT.isVector()) { VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, DAG.getUNDEF(VecVT), In1); VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, DAG.getUNDEF(VecVT), In2); } else { VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); } } else { llvm_unreachable("Invalid type for copysign!"); } SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); // If we couldn't materialize the mask above, then the mask vector will be // the zero vector, and we need to negate it here. if (VT == MVT::f64 || VT == MVT::v2f64) { BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); } SDValue Sel = DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); if (VT == MVT::f32) return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); else if (VT == MVT::f64) return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); else return DAG.getNode(ISD::BITCAST, DL, VT, Sel); } SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { if (DAG.getMachineFunction().getFunction()->hasFnAttribute( Attribute::NoImplicitFloat)) return SDValue(); if (!Subtarget->hasNEON()) return SDValue(); // While there is no integer popcount instruction, it can // be more efficiently lowered to the following sequence that uses // AdvSIMD registers/instructions as long as the copies to/from // the AdvSIMD registers are cheap. // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd // CNT V0.8B, V0.8B // 8xbyte pop-counts // ADDV B0, V0.8B // sum 8xbyte pop-counts // UMOV X0, V0.B[0] // copy byte result back to integer reg SDValue Val = Op.getOperand(0); SDLoc DL(Op); EVT VT = Op.getValueType(); if (VT == MVT::i32) Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); SDValue UaddLV = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); if (VT == MVT::i64) UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); return UaddLV; } SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDLoc dl(Op); // We chose ZeroOrOneBooleanContents, so use zero and one. EVT VT = Op.getValueType(); SDValue TVal = DAG.getConstant(1, dl, VT); SDValue FVal = DAG.getConstant(0, dl, VT); // Handle f128 first, since one possible outcome is a normal integer // comparison which gets picked up by the next if statement. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); // If softenSetCCOperands returned a scalar, use it. if (!RHS.getNode()) { assert(LHS.getValueType() == Op.getValueType() && "Unexpected setcc expansion!"); return LHS; } } if (LHS.getValueType().isInteger()) { SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); } // Now we know we're dealing with FP values. assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead // and do the comparison. SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); if (CC2 == AArch64CC::AL) { changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); } else { // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't // totally clean. Some of them require two CSELs to implement. As is in // this case, we emit the first CSEL and then emit a second using the output // of the first as the RHS. We're effectively OR'ing the two CC's together. // FIXME: It would be nice if we could match the two CSELs to two CSINCs. SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } } SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, SDLoc dl, SelectionDAG &DAG) const { // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!RHS.getNode()) { RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } // Also handle f16, for which we need to do a f32 comparison. if (LHS.getValueType() == MVT::f16) { LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); } // Next, handle integers. if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); unsigned Opcode = AArch64ISD::CSEL; // If both the TVal and the FVal are constants, see if we can swap them in // order to for a CSINV or CSINC out of them. ConstantSDNode *CFVal = dyn_cast(FVal); ConstantSDNode *CTVal = dyn_cast(TVal); if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } else if (TVal.getOpcode() == ISD::XOR) { // If TVal is a NOT we want to swap TVal and FVal so that we can match // with a CSINV rather than a CSEL. if (isAllOnesConstant(TVal.getOperand(1))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } } else if (TVal.getOpcode() == ISD::SUB) { // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so // that we can match with a CSNEG rather than a CSEL. if (isNullConstant(TVal.getOperand(0))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } } else if (CTVal && CFVal) { const int64_t TrueVal = CTVal->getSExtValue(); const int64_t FalseVal = CFVal->getSExtValue(); bool Swap = false; // If both TVal and FVal are constants, see if FVal is the // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC // instead of a CSEL in that case. if (TrueVal == ~FalseVal) { Opcode = AArch64ISD::CSINV; } else if (TrueVal == -FalseVal) { Opcode = AArch64ISD::CSNEG; } else if (TVal.getValueType() == MVT::i32) { // If our operands are only 32-bit wide, make sure we use 32-bit // arithmetic for the check whether we can use CSINC. This ensures that // the addition in the check will wrap around properly in case there is // an overflow (which would not be the case if we do the check with // 64-bit arithmetic). const uint32_t TrueVal32 = CTVal->getZExtValue(); const uint32_t FalseVal32 = CFVal->getZExtValue(); if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { Opcode = AArch64ISD::CSINC; if (TrueVal32 > FalseVal32) { Swap = true; } } // 64-bit check whether we can use CSINC. } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { Opcode = AArch64ISD::CSINC; if (TrueVal > FalseVal) { Swap = true; } } // Swap TVal and FVal if necessary. if (Swap) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } if (Opcode != AArch64ISD::CSEL) { // Drop FVal since we can get its value by simply inverting/negating // TVal. FVal = TVal; } } SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); EVT VT = TVal.getValueType(); return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); } // Now we know we're dealing with FP values. assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); assert(LHS.getValueType() == RHS.getValueType()); EVT VT = TVal.getValueType(); SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two CSELs to implement. AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); // If we need a second CSEL, emit it, using the output of the first as the // RHS. We're effectively OR'ing the two CC's together. if (CC2 != AArch64CC::AL) { SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } // Otherwise, return the output of the first CSEL. return CS1; } SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue TVal = Op.getOperand(2); SDValue FVal = Op.getOperand(3); SDLoc DL(Op); return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); } SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue CCVal = Op->getOperand(0); SDValue TVal = Op->getOperand(1); SDValue FVal = Op->getOperand(2); SDLoc DL(Op); unsigned Opc = CCVal.getOpcode(); // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select // instruction. if (CCVal.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) return SDValue(); AArch64CC::CondCode OFCC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal, Overflow); } // Lower it the same way as we would lower a SELECT_CC node. ISD::CondCode CC; SDValue LHS, RHS; if (CCVal.getOpcode() == ISD::SETCC) { LHS = CCVal.getOperand(0); RHS = CCVal.getOperand(1); CC = cast(CCVal->getOperand(2))->get(); } else { LHS = CCVal; RHS = DAG.getConstant(0, DL, CCVal.getValueType()); CC = ISD::SETNE; } return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); } SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. JumpTableSDNode *JT = cast(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( AArch64ISD::WrapperLarge, DL, PtrVT, DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G0 | MO_NC)); } SDValue Hi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); } SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large) { // Use the GOT for the large code model on iOS. if (Subtarget->isTargetMachO()) { SDValue GotAddr = DAG.getTargetConstantPool( CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), AArch64II::MO_GOT); return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); } const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( AArch64ISD::WrapperLarge, DL, PtrVT, DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), AArch64II::MO_G3), DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), AArch64II::MO_G2 | MO_NC), DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), AArch64II::MO_G1 | MO_NC), DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), AArch64II::MO_G0 | MO_NC)); } else { // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on // ELF, the only valid one on Darwin. SDValue Hi = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), AArch64II::MO_PAGE); SDValue Lo = DAG.getTargetConstantPool( CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); } } SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { const BlockAddress *BA = cast(Op)->getBlockAddress(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( AArch64ISD::WrapperLarge, DL, PtrVT, DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); } else { SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); } } SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const { AArch64FunctionInfo *FuncInfo = DAG.getMachineFunction().getInfo(); SDLoc DL(Op); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy(DAG.getDataLayout())); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV), false, false, 0); } SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call // Standard, section B.3. MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue VAList = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); SmallVector MemOps; // void *__stack at offset 0 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, MachinePointerInfo(SV), false, false, 8)); // void *__gr_top at offset 8 int GPRSize = FuncInfo->getVarArgsGPRSize(); if (GPRSize > 0) { SDValue GRTop, GRTopAddr; GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT)); GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, DAG.getConstant(GPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, MachinePointerInfo(SV, 8), false, false, 8)); } // void *__vr_top at offset 16 int FPRSize = FuncInfo->getVarArgsFPRSize(); if (FPRSize > 0) { SDValue VRTop, VRTopAddr; VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(16, DL, PtrVT)); VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, DAG.getConstant(FPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, MachinePointerInfo(SV, 16), false, false, 8)); } // int __gr_offs at offset 24 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, MachinePointerInfo(SV, 24), false, false, 4)); // int __vr_offs at offset 28 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, MachinePointerInfo(SV, 28), false, false, 4)); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) : LowerAAPCS_VASTART(Op, DAG); } SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32; const Value *DestSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), DAG.getConstant(VaListSize, DL, MVT::i32), 8, false, false, false, MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin() && "automatic va_arg instruction only works on Darwin"); const Value *V = cast(Op.getOperand(2))->getValue(); EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); unsigned Align = Op.getConstantOperandVal(3); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V), false, false, false, 0); Chain = VAList.getValue(1); if (Align > 8) { assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align - 1, DL, PtrVT)); VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, DAG.getConstant(-(int64_t)Align, DL, PtrVT)); } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); // Scalar integer and FP values smaller than 64 bits are implicitly extended // up to 64 bits. At the very least, we have to increase the striding of the // vaargs list to match this, and for FP values we need to introduce // FP_ROUND nodes as well. if (VT.isInteger() && !VT.isVector()) ArgSize = 8; bool NeedFPTrunc = false; if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { ArgSize = 8; NeedFPTrunc = true; } // Increment the pointer, VAList, to the next vaarg SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(ArgSize, DL, PtrVT)); // Store the incremented VAList to the legalized pointer SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), false, false, 0); // Load the actual argument out of the pointer VAList if (NeedFPTrunc) { // Load the value as an f64. SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo(), false, false, false, 0); // Round the value down to an f32. SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), DAG.getIntPtrConstant(1, DL)); SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; // Merge the rounded value with the chain output of the load. return DAG.getMergeValues(Ops, DL); } return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, false, false, 0); } SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, MachinePointerInfo(), false, false, false, 0); return FrameAddr; } // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { unsigned Reg = StringSwitch(RegName) .Case("sp", AArch64::SP) .Default(0); if (Reg) return Reg; report_fatal_error(Twine("Invalid register name \"" + StringRef(RegName) + "\".")); } SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MFI->setReturnAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); } // Return LR, which contains the return address. Mark it an implicit live-in. unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); } /// LowerShiftRightParts - Lower SRA_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which // is "undef". We wanted 0, so CSEL it directly. SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETEQ, dl, DAG); SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); HiBitsForLo = DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), HiBitsForLo, CCVal, Cmp); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); SDValue LoForNormalShift = DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, dl, DAG); CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, LoForNormalShift, CCVal, Cmp); // AArch64 shifts larger than the register width are wrapped rather than // clamped, so we can't just emit "hi >> x". SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); SDValue HiForBigShift = Opc == ISD::SRA ? DAG.getNode(Opc, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, dl, MVT::i64)) : DAG.getConstant(0, dl, VT); SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, HiForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which // is "undef". We wanted 0, so CSEL it directly. SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETEQ, dl, DAG); SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); LoBitsForHi = DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), LoBitsForHi, CCVal, Cmp); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); SDValue HiForNormalShift = DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi); SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, dl, DAG); CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, HiForNormalShift, CCVal, Cmp); // AArch64 shifts of larger than register sizes are wrapped rather than // clamped, so we can't just emit "lo << a" if a is too big. SDValue LoForBigShift = DAG.getConstant(0, dl, VT); SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, LoForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } bool AArch64TargetLowering::isOffsetFoldingLegal( const GlobalAddressSDNode *GA) const { // The AArch64 target doesn't support folding offsets into global addresses. return false; } bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. // FIXME: We should be able to handle f128 as well with a clever lowering. if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) return true; if (VT == MVT::f64) return AArch64_AM::getFP64Imm(Imm) != -1; else if (VT == MVT::f32) return AArch64_AM::getFP32Imm(Imm) != -1; return false; } //===----------------------------------------------------------------------===// // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // AArch64 Inline Assembly Support //===----------------------------------------------------------------------===// // Table of Constraints // TODO: This is the current set of constraints supported by ARM for the // compiler, not all of them may make sense, e.g. S may be difficult to support. // // r - A general register // w - An FP/SIMD register of some size in the range v0-v31 // x - An FP/SIMD register of some size in the range v0-v15 // I - Constant that can be used with an ADD instruction // J - Constant that can be used with a SUB instruction // K - Constant that can be used with a 32-bit logical instruction // L - Constant that can be used with a 64-bit logical instruction // M - Constant that can be used as a 32-bit MOV immediate // N - Constant that can be used as a 64-bit MOV immediate // Q - A memory reference with base register and no offset // S - A symbolic address // Y - Floating point constant zero // Z - Integer constant zero // // Note that general register operands will be output using their 64-bit x // register name, whatever the size of the variable, unless the asm operand // is prefixed by the %w modifier. Floating-point and SIMD register operands // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or // %q modifier. /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. AArch64TargetLowering::ConstraintType AArch64TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; case 'z': return C_Other; case 'x': case 'w': return C_RegisterClass; // An address with a single base register. Due to the way we // currently handle addresses it is the same as 'r'. case 'Q': return C_Memory; } } return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight AArch64TargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); break; case 'x': case 'w': if (type->isFloatingPointTy() || type->isVectorTy()) weight = CW_Register; break; case 'z': weight = CW_Constant; break; } return weight; } std::pair AArch64TargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': if (VT.getSizeInBits() == 64) return std::make_pair(0U, &AArch64::GPR64commonRegClass); return std::make_pair(0U, &AArch64::GPR32commonRegClass); case 'w': if (VT == MVT::f32) return std::make_pair(0U, &AArch64::FPR32RegClass); if (VT.getSizeInBits() == 64) return std::make_pair(0U, &AArch64::FPR64RegClass); if (VT.getSizeInBits() == 128) return std::make_pair(0U, &AArch64::FPR128RegClass); break; // The instructions that this constraint is designed for can // only take 128-bit registers so just use that regclass. case 'x': if (VT.getSizeInBits() == 128) return std::make_pair(0U, &AArch64::FPR128_loRegClass); break; } } if (StringRef("{cc}").equals_lower(Constraint)) return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? if (!Res.second) { unsigned Size = Constraint.size(); if ((Size == 4 || Size == 5) && Constraint[0] == '{' && tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { int RegNo; bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); if (!Failed && RegNo >= 0 && RegNo <= 31) { // v0 - v31 are aliases of q0 - q31. // By default we'll emit v0-v31 for this unless there's a modifier where // we'll emit the correct register as well. Res.first = AArch64::FPR128RegClass.getRegister(RegNo); Res.second = &AArch64::FPR128RegClass; } } } return Res; } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void AArch64TargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const { SDValue Result; // Currently only support length 1 constraints. if (Constraint.length() != 1) return; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { default: break; // This set of constraints deal with valid constants for various instructions. // Validate and return a target constant for them if we can. case 'z': { // 'z' maps to xzr or wzr so it needs an input of 0. if (!isNullConstant(Op)) return; if (Op.getValueType() == MVT::i64) Result = DAG.getRegister(AArch64::XZR, MVT::i64); else Result = DAG.getRegister(AArch64::WZR, MVT::i32); break; } case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': ConstantSDNode *C = dyn_cast(Op); if (!C) return; // Grab the value and do some validation. uint64_t CVal = C->getZExtValue(); switch (ConstraintLetter) { // The I constraint applies only to simple ADD or SUB immediate operands: // i.e. 0 to 4095 with optional shift by 12 // The J constraint applies only to ADD or SUB immediates that would be // valid when negated, i.e. if [an add pattern] were to be output as a SUB // instruction [or vice versa], in other words -1 to -4095 with optional // left shift by 12. case 'I': if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) break; return; case 'J': { uint64_t NVal = -C->getSExtValue(); if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { CVal = C->getSExtValue(); break; } return; } // The K and L constraints apply *only* to logical immediates, including // what used to be the MOVI alias for ORR (though the MOVI alias has now // been removed and MOV should be used). So these constraints have to // distinguish between bit patterns that are valid 32-bit or 64-bit // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice // versa. case 'K': if (AArch64_AM::isLogicalImmediate(CVal, 32)) break; return; case 'L': if (AArch64_AM::isLogicalImmediate(CVal, 64)) break; return; // The M and N constraints are a superset of K and L respectively, for use // with the MOV (immediate) alias. As well as the logical immediates they // also match 32 or 64-bit immediates that can be loaded either using a // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca // (M) or 64-bit 0x1234000000000000 (N) etc. // As a note some of this code is liberally stolen from the asm parser. case 'M': { if (!isUInt<32>(CVal)) return; if (AArch64_AM::isLogicalImmediate(CVal, 32)) break; if ((CVal & 0xFFFF) == CVal) break; if ((CVal & 0xFFFF0000ULL) == CVal) break; uint64_t NCVal = ~(uint32_t)CVal; if ((NCVal & 0xFFFFULL) == NCVal) break; if ((NCVal & 0xFFFF0000ULL) == NCVal) break; return; } case 'N': { if (AArch64_AM::isLogicalImmediate(CVal, 64)) break; if ((CVal & 0xFFFFULL) == CVal) break; if ((CVal & 0xFFFF0000ULL) == CVal) break; if ((CVal & 0xFFFF00000000ULL) == CVal) break; if ((CVal & 0xFFFF000000000000ULL) == CVal) break; uint64_t NCVal = ~CVal; if ((NCVal & 0xFFFFULL) == NCVal) break; if ((NCVal & 0xFFFF0000ULL) == NCVal) break; if ((NCVal & 0xFFFF00000000ULL) == NCVal) break; if ((NCVal & 0xFFFF000000000000ULL) == NCVal) break; return; } default: return; } // All assembler immediates are 64-bit integers. Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); break; } if (Result.getNode()) { Ops.push_back(Result); return; } return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } //===----------------------------------------------------------------------===// // AArch64 Advanced SIMD Support //===----------------------------------------------------------------------===// /// WidenVector - Given a value in the V64 register class, produce the /// equivalent value in the V128 register class. static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { EVT VT = V64Reg.getValueType(); unsigned NarrowSize = VT.getVectorNumElements(); MVT EltTy = VT.getVectorElementType().getSimpleVT(); MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); SDLoc DL(V64Reg); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), V64Reg, DAG.getConstant(0, DL, MVT::i32)); } /// getExtFactor - Determine the adjustment factor for the position when /// generating an "extract from vector registers" instruction. static unsigned getExtFactor(SDValue &V) { EVT EltType = V.getValueType().getVectorElementType(); return EltType.getSizeInBits() / 8; } /// NarrowVector - Given a value in the V128 register class, produce the /// equivalent value in the V64 register class. static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { EVT VT = V128Reg.getValueType(); unsigned WideSize = VT.getVectorNumElements(); MVT EltTy = VT.getVectorElementType().getSimpleVT(); MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); SDLoc DL(V128Reg); return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); } // Gather data to see if the operation can be modelled as a // shuffle in combination with VEXTs. SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); struct ShuffleSourceInfo { SDValue Vec; unsigned MinElt; unsigned MaxElt; // We may insert some combination of BITCASTs and VEXT nodes to force Vec to // be compatible with the shuffle we intend to construct. As a result // ShuffleVec will be some sliding window into the original Vec. SDValue ShuffleVec; // Code should guarantee that element i in Vec starts at element "WindowBase // + i * WindowScale in ShuffleVec". int WindowBase; int WindowScale; bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } ShuffleSourceInfo(SDValue Vec) : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} }; // First gather all vectors used as an immediate source for this BUILD_VECTOR // node. SmallVector Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) continue; else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { // A shuffle can only come from building a vector from various // elements of other vectors. return SDValue(); } // Add this element source to the list if it's not already there. SDValue SourceVec = V.getOperand(0); auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); if (Source == Sources.end()) Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); // Update the minimum and maximum lane number seen. unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); Source->MinElt = std::min(Source->MinElt, EltNo); Source->MaxElt = std::max(Source->MaxElt, EltNo); } // Currently only do something sane when at most two source vectors // are involved. if (Sources.size() > 2) return SDValue(); // Find out the smallest element size among result and two sources, and use // it as element size to build the shuffle_vector. EVT SmallestEltTy = VT.getVectorElementType(); for (auto &Source : Sources) { EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); if (SrcEltTy.bitsLT(SmallestEltTy)) { SmallestEltTy = SrcEltTy; } } unsigned ResMultiplier = VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); // If the source vector is too wide or too narrow, we may nevertheless be able // to construct a compatible shuffle either by concatenating it with UNDEF or // extracting a suitable range of elements. for (auto &Src : Sources) { EVT SrcVT = Src.ShuffleVec.getValueType(); if (SrcVT.getSizeInBits() == VT.getSizeInBits()) continue; // This stage of the search produces a source with the same element type as // the original, but with a total width matching the BUILD_VECTOR output. EVT EltVT = SrcVT.getVectorElementType(); unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits()); // We can pad out the smaller vector for free, so if it's part of a // shuffle... Src.ShuffleVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, DAG.getUNDEF(Src.ShuffleVec.getValueType())); continue; } assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); if (Src.MaxElt - Src.MinElt >= NumSrcElts) { // Span too large for a VEXT to cope return SDValue(); } if (Src.MinElt >= NumSrcElts) { // The extraction can just take the second half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, dl, MVT::i64)); Src.WindowBase = -NumSrcElts; } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, dl, MVT::i64)); } else { // An actual VEXT is needed SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, dl, MVT::i64)); SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, dl, MVT::i64)); unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, VEXTSrc2, DAG.getConstant(Imm, dl, MVT::i32)); Src.WindowBase = -Src.MinElt; } } // Another possible incompatibility occurs from the vector element types. We // can fix this by bitcasting the source vectors to the same type we intend // for the shuffle. for (auto &Src : Sources) { EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); if (SrcEltTy == SmallestEltTy) continue; assert(ShuffleVT.getVectorElementType() == SmallestEltTy); Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); Src.WindowBase *= Src.WindowScale; } // Final sanity check before we try to actually produce a shuffle. DEBUG( for (auto Src : Sources) assert(Src.ShuffleVec.getValueType() == ShuffleVT); ); // The stars all align, our next step is to produce the mask for the shuffle. SmallVector Mask(ShuffleVT.getVectorNumElements(), -1); int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); if (Entry.getOpcode() == ISD::UNDEF) continue; auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); int EltNo = cast(Entry.getOperand(1))->getSExtValue(); // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit // trunc. So only std::min(SrcBits, DestBits) actually get defined in this // segment. EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); int BitsDefined = std::min(OrigEltTy.getSizeInBits(), VT.getVectorElementType().getSizeInBits()); int LanesDefined = BitsDefined / BitsPerShuffleLane; // This source is expected to fill ResMultiplier lanes of the final shuffle, // starting at the appropriate offset. int *LaneMask = &Mask[i * ResMultiplier]; int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; ExtractBase += NumElts * (Src - Sources.begin()); for (int j = 0; j < LanesDefined; ++j) LaneMask[j] = ExtractBase + j; } // Final check before we try to produce nonsense... if (!isShuffleMaskLegal(Mask, ShuffleVT)) return SDValue(); SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; for (unsigned i = 0; i < Sources.size(); ++i) ShuffleOps[i] = Sources[i].ShuffleVec; SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], ShuffleOps[1], &Mask[0]); return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } // check if an EXT instruction can handle the shuffle mask when the // vector sources of the shuffle are the same. static bool isSingletonEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { unsigned NumElts = VT.getVectorNumElements(); // Assume that the first shuffle index is not UNDEF. Fail if it is. if (M[0] < 0) return false; Imm = M[0]; // If this is a VEXT shuffle, the immediate value is the index of the first // element. The other shuffle indices must be the successive elements after // the first one. unsigned ExpectedElt = Imm; for (unsigned i = 1; i < NumElts; ++i) { // Increment the expected index. If it wraps around, just follow it // back to index zero and keep going. ++ExpectedElt; if (ExpectedElt == NumElts) ExpectedElt = 0; if (M[i] < 0) continue; // ignore UNDEF indices if (ExpectedElt != static_cast(M[i])) return false; } return true; } // check if an EXT instruction can handle the shuffle mask when the // vector sources of the shuffle are different. static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, unsigned &Imm) { // Look for the first non-undef element. const int *FirstRealElt = std::find_if(M.begin(), M.end(), [](int Elt) {return Elt >= 0;}); // Benefit form APInt to handle overflow when calculating expected element. unsigned NumElts = VT.getVectorNumElements(); unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); // The following shuffle indices must be the successive elements after the // first real element. const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); if (FirstWrongElt != M.end()) return false; // The index of an EXT is the first element if it is not UNDEF. // Watch out for the beginning UNDEFs. The EXT index should be the expected // value of the first element. E.g. // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. // ExpectedElt is the last mask index plus 1. Imm = ExpectedElt.getZExtValue(); // There are two difference cases requiring to reverse input vectors. // For example, for vector <4 x i32> we have the following cases, // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) // For both cases, we finally use mask <5, 6, 7, 0>, which requires // to reverse two input vectors. if (Imm < NumElts) ReverseEXT = true; else Imm -= NumElts; return true; } /// isREVMask - Check if a vector shuffle corresponds to a REV /// instruction with the specified blocksize. (The order of the elements /// within each block of the vector is reversed.) static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && "Only possible block sizes for REV are: 16, 32, 64"); unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); unsigned BlockElts = M[0] + 1; // If the first shuffle index is UNDEF, be optimistic. if (M[0] < 0) BlockElts = BlockSize / EltSz; if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) return false; for (unsigned i = 0; i < NumElts; ++i) { if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) return false; } return true; } static bool isZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != Idx) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) return false; Idx += 1; } return true; } static bool isUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i != NumElts; ++i) { if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned)M[i] != 2 * i + WhichResult) return false; } return true; } static bool isTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) return false; } return true; } /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. static bool isZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != Idx) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) return false; Idx += 1; } return true; } /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned Half = VT.getVectorNumElements() / 2; WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned j = 0; j != 2; ++j) { unsigned Idx = WhichResult; for (unsigned i = 0; i != Half; ++i) { int MIdx = M[i + j * Half]; if (MIdx >= 0 && (unsigned)MIdx != Idx) return false; Idx += 2; } } return true; } /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. static bool isTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) return false; } return true; } static bool isINSMask(ArrayRef M, int NumInputElements, bool &DstIsLeft, int &Anomaly) { if (M.size() != static_cast(NumInputElements)) return false; int NumLHSMatch = 0, NumRHSMatch = 0; int LastLHSMismatch = -1, LastRHSMismatch = -1; for (int i = 0; i < NumInputElements; ++i) { if (M[i] == -1) { ++NumLHSMatch; ++NumRHSMatch; continue; } if (M[i] == i) ++NumLHSMatch; else LastLHSMismatch = i; if (M[i] == i + NumInputElements) ++NumRHSMatch; else LastRHSMismatch = i; } if (NumLHSMatch == NumInputElements - 1) { DstIsLeft = true; Anomaly = LastLHSMismatch; return true; } else if (NumRHSMatch == NumInputElements - 1) { DstIsLeft = false; Anomaly = LastRHSMismatch; return true; } return false; } static bool isConcatMask(ArrayRef Mask, EVT VT, bool SplitLHS) { if (VT.getSizeInBits() != 128) return false; unsigned NumElts = VT.getVectorNumElements(); for (int I = 0, E = NumElts / 2; I != E; I++) { if (Mask[I] != I) return false; } int Offset = NumElts / 2; for (int I = NumElts / 2, E = NumElts; I != E; I++) { if (Mask[I] != I + SplitLHS * Offset) return false; } return true; } static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue V0 = Op.getOperand(0); SDValue V1 = Op.getOperand(1); ArrayRef Mask = cast(Op)->getMask(); if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || VT.getVectorElementType() != V1.getValueType().getVectorElementType()) return SDValue(); bool SplitV0 = V0.getValueType().getSizeInBits() == 128; if (!isConcatMask(Mask, VT, SplitV0)) return SDValue(); EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), VT.getVectorNumElements() / 2); if (SplitV0) { V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, DAG.getConstant(0, DL, MVT::i64)); } if (V1.getValueType().getSizeInBits() == 128) { V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, DAG.getConstant(0, DL, MVT::i64)); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit /// the specified operations to build the shuffle. static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, SDLoc dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); enum { OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> OP_VREV, OP_VDUP0, OP_VDUP1, OP_VDUP2, OP_VDUP3, OP_VEXT1, OP_VEXT2, OP_VEXT3, OP_VUZPL, // VUZP, left result OP_VUZPR, // VUZP, right result OP_VZIPL, // VZIP, left result OP_VZIPR, // VZIP, right result OP_VTRNL, // VTRN, left result OP_VTRNR // VTRN, right result }; if (OpNum == OP_COPY) { if (LHSID == (1 * 9 + 2) * 9 + 3) return LHS; assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); return RHS; } SDValue OpLHS, OpRHS; OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); EVT VT = OpLHS.getValueType(); switch (OpNum) { default: llvm_unreachable("Unknown shuffle opcode!"); case OP_VREV: // VREV divides the vector in half and swaps within the half. if (VT.getVectorElementType() == MVT::i32 || VT.getVectorElementType() == MVT::f32) return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); // vrev <4 x i16> -> REV32 if (VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::f16) return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); // vrev <4 x i8> -> REV16 assert(VT.getVectorElementType() == MVT::i8); return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); case OP_VDUP0: case OP_VDUP1: case OP_VDUP2: case OP_VDUP3: { EVT EltTy = VT.getVectorElementType(); unsigned Opcode; if (EltTy == MVT::i8) Opcode = AArch64ISD::DUPLANE8; else if (EltTy == MVT::i16 || EltTy == MVT::f16) Opcode = AArch64ISD::DUPLANE16; else if (EltTy == MVT::i32 || EltTy == MVT::f32) Opcode = AArch64ISD::DUPLANE32; else if (EltTy == MVT::i64 || EltTy == MVT::f64) Opcode = AArch64ISD::DUPLANE64; else llvm_unreachable("Invalid vector element type?"); if (VT.getSizeInBits() == 64) OpLHS = WidenVector(OpLHS, DAG); SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); } case OP_VEXT1: case OP_VEXT2: case OP_VEXT3: { unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, DAG.getConstant(Imm, dl, MVT::i32)); } case OP_VUZPL: return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VUZPR: return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VZIPL: return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VZIPR: return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VTRNL: return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VTRNR: return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); } } static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, SelectionDAG &DAG) { // Check to see if we can use the TBL instruction. SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc DL(Op); EVT EltVT = Op.getValueType().getVectorElementType(); unsigned BytesPerElt = EltVT.getSizeInBits() / 8; SmallVector TBLMask; for (int Val : ShuffleMask) { for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { unsigned Offset = Byte + Val * BytesPerElt; TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); } } MVT IndexVT = MVT::v8i8; unsigned IndexLen = 8; if (Op.getValueType().getSizeInBits() == 128) { IndexVT = MVT::v16i8; IndexLen = 16; } SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); SDValue Shuffle; if (V2.getNode()->getOpcode() == ISD::UNDEF) { if (IndexLen == 8) V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, makeArrayRef(TBLMask.data(), IndexLen))); } else { if (IndexLen == 8) { V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, makeArrayRef(TBLMask.data(), IndexLen))); } else { // FIXME: We cannot, for the moment, emit a TBL2 instruction because we // cannot currently represent the register constraints on the input // table registers. // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, // &TBLMask[0], IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, V2Cst, DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, makeArrayRef(TBLMask.data(), IndexLen))); } } return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); } static unsigned getDUPLANEOp(EVT EltType) { if (EltType == MVT::i8) return AArch64ISD::DUPLANE8; if (EltType == MVT::i16 || EltType == MVT::f16) return AArch64ISD::DUPLANE16; if (EltType == MVT::i32 || EltType == MVT::f32) return AArch64ISD::DUPLANE32; if (EltType == MVT::i64 || EltType == MVT::f64) return AArch64ISD::DUPLANE64; llvm_unreachable("Invalid vector element type?"); } SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT VT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast(Op.getNode()); // Convert shuffles that are directly supported on NEON to target-specific // DAG nodes, instead of keeping them as shuffles and matching them again // during code selection. This is more efficient and avoids the possibility // of inconsistencies between legalization and selection. ArrayRef ShuffleMask = SVN->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], V1.getValueType().getSimpleVT())) { int Lane = SVN->getSplatIndex(); // If this is undef splat, generate it via "just" vdup, if possible. if (Lane == -1) Lane = 0; if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), V1.getOperand(0)); // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- // constant. If so, we can just reference the lane's definition directly. if (V1.getOpcode() == ISD::BUILD_VECTOR && !isa(V1.getOperand(Lane))) return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); // Otherwise, duplicate from the lane of the input vector. unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); // SelectionDAGBuilder may have "helpfully" already extracted or conatenated // to make a vector of the same size as this SHUFFLE. We can ignore the // extract entirely, and canonicalise the concat using WidenVector. if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { Lane += cast(V1.getOperand(1))->getZExtValue(); V1 = V1.getOperand(0); } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; Lane -= Idx * VT.getVectorNumElements() / 2; V1 = WidenVector(V1.getOperand(Idx), DAG); } else if (VT.getSizeInBits() == 64) V1 = WidenVector(V1, DAG); return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64)); } if (isREVMask(ShuffleMask, VT, 64)) return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); if (isREVMask(ShuffleMask, VT, 32)) return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); if (isREVMask(ShuffleMask, VT, 16)) return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); bool ReverseEXT = false; unsigned Imm; if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { if (ReverseEXT) std::swap(V1, V2); Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, DAG.getConstant(Imm, dl, MVT::i32)); } else if (V2->getOpcode() == ISD::UNDEF && isSingletonEXTMask(ShuffleMask, VT, Imm)) { Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); } unsigned WhichResult; if (isZIPMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isUZPMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isTRNMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } SDValue Concat = tryFormConcatFromShuffle(Op, DAG); if (Concat.getNode()) return Concat; bool DstIsLeft; int Anomaly; int NumInputElements = V1.getValueType().getVectorNumElements(); if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { SDValue DstVec = DstIsLeft ? V1 : V2; SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); SDValue SrcVec = V1; int SrcLane = ShuffleMask[Anomaly]; if (SrcLane >= NumInputElements) { SrcVec = V2; SrcLane -= VT.getVectorNumElements(); } SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); EVT ScalarVT = VT.getVectorElementType(); if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger()) ScalarVT = MVT::i32; return DAG.getNode( ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), DstLaneV); } // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. unsigned NumElts = VT.getVectorNumElements(); if (NumElts == 4) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (ShuffleMask[i] < 0) PFIndexes[i] = 8; else PFIndexes[i] = ShuffleMask[i]; } // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + PFIndexes[2] * 9 + PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); if (Cost <= 4) return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } return GenerateTBL(Op, ShuffleMask, DAG); } static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits) { EVT VT = BVN->getValueType(0); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; for (unsigned i = 0; i < NumSplats; ++i) { CnstBits <<= SplatBitSize; UndefBits <<= SplatBitSize; CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); } return true; } return false; } SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, SelectionDAG &DAG) const { BuildVectorSDNode *BVN = dyn_cast(Op.getOperand(1).getNode()); SDValue LHS = Op.getOperand(0); SDLoc dl(Op); EVT VT = Op.getValueType(); if (!BVN) return Op; APInt CnstBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); if (resolveBuildVector(BVN, CnstBits, UndefBits)) { // We only have BIC vector immediate instruction, which is and-not. CnstBits = ~CnstBits; // We make use of a little bit of goto ickiness in order to avoid having to // duplicate the immediate matching logic for the undef toggled case. bool SecondTry = false; AttemptModImm: if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { CnstBits = CnstBits.zextOrTrunc(64); uint64_t CnstVal = CnstBits.getZExtValue(); if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(16, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(24, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } if (SecondTry) goto FailedModImm; SecondTry = true; CnstBits = ~UndefBits; goto AttemptModImm; } // We can always fall back to a non-immediate AND. FailedModImm: return Op; } // Specialized code to quickly find if PotentialBVec is a BuildVector that // consists of only the same constant int value, returned in reference arg // ConstVal static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal) { BuildVectorSDNode *Bvec = dyn_cast(PotentialBVec); if (!Bvec) return false; ConstantSDNode *FirstElt = dyn_cast(Bvec->getOperand(0)); if (!FirstElt) return false; EVT VT = Bvec->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); for (unsigned i = 1; i < NumElts; ++i) if (dyn_cast(Bvec->getOperand(i)) != FirstElt) return false; ConstVal = FirstElt->getZExtValue(); return true; } static unsigned getIntrinsicID(const SDNode *N) { unsigned Opcode = N->getOpcode(); switch (Opcode) { default: return Intrinsic::not_intrinsic; case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast(N->getOperand(0))->getZExtValue(); if (IID < Intrinsic::num_intrinsics) return IID; return Intrinsic::not_intrinsic; } } } // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a // BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. // Also, logical shift right -> sri, with the same structure. static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (!VT.isVector()) return SDValue(); SDLoc DL(N); // Is the first op an AND? const SDValue And = N->getOperand(0); if (And.getOpcode() != ISD::AND) return SDValue(); // Is the second op an shl or lshr? SDValue Shift = N->getOperand(1); // This will have been turned into: AArch64ISD::VSHL vector, #shift // or AArch64ISD::VLSHR vector, #shift unsigned ShiftOpc = Shift.getOpcode(); if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) return SDValue(); bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; // Is the shift amount constant? ConstantSDNode *C2node = dyn_cast(Shift.getOperand(1)); if (!C2node) return SDValue(); // Is the and mask vector all constant? uint64_t C1; if (!isAllConstantBuildVector(And.getOperand(1), C1)) return SDValue(); // Is C1 == ~C2, taking into account how much one can shift elements of a // particular size? uint64_t C2 = C2node->getZExtValue(); unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits(); if (C2 > ElemSizeInBits) return SDValue(); unsigned ElemMask = (1 << ElemSizeInBits) - 1; if ((C1 & ElemMask) != (~C2 & ElemMask)) return SDValue(); SDValue X = And.getOperand(0); SDValue Y = Shift.getOperand(0); unsigned Intrin = IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; SDValue ResultSLI = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Intrin, DL, MVT::i32), X, Y, Shift.getOperand(1)); DEBUG(dbgs() << "aarch64-lower: transformed: \n"); DEBUG(N->dump(&DAG)); DEBUG(dbgs() << "into: \n"); DEBUG(ResultSLI->dump(&DAG)); ++NumShiftInserts; return ResultSLI; } SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) if (EnableAArch64SlrGeneration) { SDValue Res = tryLowerToSLI(Op.getNode(), DAG); if (Res.getNode()) return Res; } BuildVectorSDNode *BVN = dyn_cast(Op.getOperand(0).getNode()); SDValue LHS = Op.getOperand(1); SDLoc dl(Op); EVT VT = Op.getValueType(); // OR commutes, so try swapping the operands. if (!BVN) { LHS = Op.getOperand(0); BVN = dyn_cast(Op.getOperand(1).getNode()); } if (!BVN) return Op; APInt CnstBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); if (resolveBuildVector(BVN, CnstBits, UndefBits)) { // We make use of a little bit of goto ickiness in order to avoid having to // duplicate the immediate matching logic for the undef toggled case. bool SecondTry = false; AttemptModImm: if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { CnstBits = CnstBits.zextOrTrunc(64); uint64_t CnstVal = CnstBits.getZExtValue(); if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(16, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(24, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } if (SecondTry) goto FailedModImm; SecondTry = true; CnstBits = UndefBits; goto AttemptModImm; } // We can always fall back to a non-immediate OR. FailedModImm: return Op; } // Normalize the operands of BUILD_VECTOR. The value of constant operands will // be truncated to fit element width. static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); EVT EltTy= VT.getVectorElementType(); if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) return Op; SmallVector Ops; for (SDValue Lane : Op->ops()) { if (auto *CstLane = dyn_cast(Lane)) { APInt LowBits(EltTy.getSizeInBits(), CstLane->getZExtValue()); Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); } Ops.push_back(Lane); } return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT VT = Op.getValueType(); Op = NormalizeBuildVector(Op, DAG); BuildVectorSDNode *BVN = cast(Op.getNode()); APInt CnstBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); if (resolveBuildVector(BVN, CnstBits, UndefBits)) { // We make use of a little bit of goto ickiness in order to avoid having to // duplicate the immediate matching logic for the undef toggled case. bool SecondTry = false; AttemptModImm: if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { CnstBits = CnstBits.zextOrTrunc(64); uint64_t CnstVal = CnstBits.getZExtValue(); // Certain magic vector constants (used to express things like NOT // and NEG) are passed through unmodified. This allows codegen patterns // for these operations to match. Special-purpose patterns will lower // these immediates to MOVIs if it proves necessary. if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) return Op; // The many faces of MOVI... if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal); if (VT.getSizeInBits() == 128) { SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64, DAG.getConstant(CnstVal, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } // Support the V64 version via subregister insertion. SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64, DAG.getConstant(CnstVal, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(16, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(24, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(264, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(272, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } // The few faces of FMOV... if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32; SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) && VT.getSizeInBits() == 128) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal); SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64, DAG.getConstant(CnstVal, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } // The many faces of MVNI... CnstVal = ~CnstVal; if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(16, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(24, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(8, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(264, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, DAG.getConstant(CnstVal, dl, MVT::i32), DAG.getConstant(272, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } if (SecondTry) goto FailedModImm; SecondTry = true; CnstBits = UndefBits; goto AttemptModImm; } FailedModImm: // Scan through the operands to find some interesting properties we can // exploit: // 1) If only one value is used, we can use a DUP, or // 2) if only the low element is not undef, we can just insert that, or // 3) if only one constant value is used (w/ some non-constant lanes), // we can splat the constant value into the whole vector then fill // in the non-constant lanes. // 4) FIXME: If different constant values are used, but we can intelligently // select the values we'll be overwriting for the non-constant // lanes such that we can directly materialize the vector // some other way (MOVI, e.g.), we can be sneaky. unsigned NumElts = VT.getVectorNumElements(); bool isOnlyLowElement = true; bool usesOnlyOneValue = true; bool usesOnlyOneConstantValue = true; bool isConstant = true; unsigned NumConstantLanes = 0; SDValue Value; SDValue ConstantValue; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) continue; if (i > 0) isOnlyLowElement = false; if (!isa(V) && !isa(V)) isConstant = false; if (isa(V) || isa(V)) { ++NumConstantLanes; if (!ConstantValue.getNode()) ConstantValue = V; else if (ConstantValue != V) usesOnlyOneConstantValue = false; } if (!Value.getNode()) Value = V; else if (V != Value) usesOnlyOneValue = false; } if (!Value.getNode()) return DAG.getUNDEF(VT); if (isOnlyLowElement) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); // Use DUP for non-constant splats. For f32 constant splats, reduce to // i32 and try again. if (usesOnlyOneValue) { if (!isConstant) { if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Value.getValueType() != VT) return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); // This is actually a DUPLANExx operation, which keeps everything vectory. // DUPLANE works on 128-bit vectors, widen it if necessary. SDValue Lane = Value.getOperand(1); Value = Value.getOperand(0); if (Value.getValueType().getSizeInBits() == 64) Value = WidenVector(Value, DAG); unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); return DAG.getNode(Opcode, dl, VT, Value, Lane); } if (VT.getVectorElementType().isFloatingPoint()) { SmallVector Ops; EVT EltTy = VT.getVectorElementType(); assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && "Unsupported floating-point vector type"); MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); Val = LowerBUILD_VECTOR(Val, DAG); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); } } // If there was only one constant value used and for more than one lane, // start by splatting that value, then replace the non-constant lanes. This // is better than the default, which will perform a separate initialization // for each lane. if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); // Now insert the non-constant lanes. for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); if (!isa(V) && !isa(V)) { // Note that type legalization likely mucked about with the VT of the // source operand, so we may have to convert it here before inserting. Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); } } return Val; } // If all elements are constants and the case above didn't get hit, fall back // to the default expansion, which will generate a load from the constant // pool. if (isConstant) return SDValue(); // Empirical tests suggest this is rarely worth it for vectors of length <= 2. if (NumElts >= 4) { if (SDValue shuffle = ReconstructShuffle(Op, DAG)) return shuffle; } // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we // know the default expansion would otherwise fall back on something even // worse. For a vector with one or two non-undef values, that's // scalar_to_vector for the elements followed by a shuffle (provided the // shuffle is valid for the target) and materialization element by element // on the stack followed by a load for everything else. if (!isConstant && !usesOnlyOneValue) { SDValue Vec = DAG.getUNDEF(VT); SDValue Op0 = Op.getOperand(0); unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); unsigned i = 0; // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to // a) Avoid a RMW dependency on the full vector register, and // b) Allow the register coalescer to fold away the copy if the // value is already in an S or D register. // Do not do this for UNDEF/LOAD nodes because we have better patterns // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD && (ElemSize == 32 || ElemSize == 64)) { unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; MachineSDNode *N = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, DAG.getTargetConstant(SubIdx, dl, MVT::i32)); Vec = SDValue(N, 0); ++i; } for (; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) continue; SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); } return Vec; } // Just use the default expansion. We failed to find a better alternative. return SDValue(); } SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); // Check for non-constant or out of range lane. EVT VT = Op.getOperand(0).getValueType(); ConstantSDNode *CI = dyn_cast(Op.getOperand(2)); if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) return SDValue(); // For V64 types, we perform insertion by expanding the value // to a V128 type and perform the insertion on that. SDLoc DL(Op); SDValue WideVec = WidenVector(Op.getOperand(0), DAG); EVT WideTy = WideVec.getValueType(); SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, Op.getOperand(1), Op.getOperand(2)); // Re-narrow the resultant vector. return NarrowVector(Node, DAG); } SDValue AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); // Check for non-constant or out of range lane. EVT VT = Op.getOperand(0).getValueType(); ConstantSDNode *CI = dyn_cast(Op.getOperand(1)); if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) return SDValue(); // For V64 types, we perform extraction by expanding the value // to a V128 type and perform the extraction on that. SDLoc DL(Op); SDValue WideVec = WidenVector(Op.getOperand(0), DAG); EVT WideTy = WideVec.getValueType(); EVT ExtrTy = WideTy.getVectorElementType(); if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) ExtrTy = MVT::i32; // For extractions, we just return the result directly. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, Op.getOperand(1)); } SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getOperand(0).getValueType(); SDLoc dl(Op); // Just in case... if (!VT.isVector()) return SDValue(); ConstantSDNode *Cst = dyn_cast(Op.getOperand(1)); if (!Cst) return SDValue(); unsigned Val = Cst->getZExtValue(); unsigned Size = Op.getValueType().getSizeInBits(); // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. if (Val == 0) return Op; // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) return Op; return SDValue(); } bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const { if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (M[i] < 0) PFIndexes[i] = 8; else PFIndexes[i] = M[i]; } // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + PFIndexes[2] * 9 + PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); if (Cost <= 4) return true; } bool DummyBool; int DummyInt; unsigned DummyUnsigned; return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || isEXTMask(M, VT, DummyBool, DummyUnsigned) || // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || isZIPMask(M, VT, DummyUnsigned) || isTRN_v_undef_Mask(M, VT, DummyUnsigned) || isUZP_v_undef_Mask(M, VT, DummyUnsigned) || isZIP_v_undef_Mask(M, VT, DummyUnsigned) || isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || isConcatMask(M, VT, VT.getSizeInBits() == 128)); } /// getVShiftImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift operation, where all the elements of the /// build_vector must have the same constant integer value. static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { // Ignore bit_converts. while (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, ElementBits) || SplatBitSize > ElementBits) return false; Cnt = SplatBits.getSExtValue(); return true; } /// isVShiftLImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift left operation. That value must be in the range: /// 0 <= Value < ElementBits for a left shift; or /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); } /// isVShiftRImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift right operation. The value must be in the range: /// 1 <= Value <= ElementBits for a right shift; or static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); } SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc DL(Op); int64_t Cnt; if (!Op.getOperand(1).getValueType().isVector()) return Op; unsigned EltSize = VT.getVectorElementType().getSizeInBits(); switch (Op.getOpcode()) { default: llvm_unreachable("unexpected shift opcode"); case ISD::SHL: if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), DAG.getConstant(Cnt, DL, MVT::i32)); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32), Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: // Right shift immediate if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { unsigned Opc = (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; return DAG.getNode(Opc, DL, VT, Op.getOperand(0), DAG.getConstant(Cnt, DL, MVT::i32)); } // Right shift register. Note, there is not a shift right register // instruction, but the shift left register instruction takes a signed // value, where negative numbers specify a right shift. unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl : Intrinsic::aarch64_neon_ushl; // negate the shift amount SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1)); SDValue NegShiftLeft = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0), NegShift); return NegShiftLeft; } return SDValue(); } static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, SDLoc dl, SelectionDAG &DAG) { EVT SrcVT = LHS.getValueType(); assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && "function only supposed to emit natural comparisons"); BuildVectorSDNode *BVN = dyn_cast(RHS.getNode()); APInt CnstBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); bool IsZero = IsCnst && (CnstBits == 0); if (SrcVT.getVectorElementType().isFloatingPoint()) { switch (CC) { default: return SDValue(); case AArch64CC::NE: { SDValue Fcmeq; if (IsZero) Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); else Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq); } case AArch64CC::EQ: if (IsZero) return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); case AArch64CC::GE: if (IsZero) return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); case AArch64CC::GT: if (IsZero) return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); case AArch64CC::LS: if (IsZero) return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); case AArch64CC::LT: if (!NoNans) return SDValue(); // If we ignore NaNs then we can use to the MI implementation. // Fallthrough. case AArch64CC::MI: if (IsZero) return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); } } switch (CC) { default: return SDValue(); case AArch64CC::NE: { SDValue Cmeq; if (IsZero) Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); else Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq); } case AArch64CC::EQ: if (IsZero) return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); case AArch64CC::GE: if (IsZero) return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); case AArch64CC::GT: if (IsZero) return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); case AArch64CC::LE: if (IsZero) return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); case AArch64CC::LS: return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); case AArch64CC::LO: return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); case AArch64CC::LT: if (IsZero) return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); case AArch64CC::HI: return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); case AArch64CC::HS: return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); } } SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); SDLoc dl(Op); if (LHS.getValueType().getVectorElementType().isInteger()) { assert(LHS.getValueType() == RHS.getValueType()); AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); SDValue Cmp = EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); } assert(LHS.getValueType().getVectorElementType() == MVT::f32 || LHS.getValueType().getVectorElementType() == MVT::f64); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. AArch64CC::CondCode CC1, CC2; bool ShouldInvert; changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; SDValue Cmp = EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); if (!Cmp.getNode()) return SDValue(); if (CC2 != AArch64CC::AL) { SDValue Cmp2 = EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); if (!Cmp2.getNode()) return SDValue(); Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); } Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); if (ShouldInvert) return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); return Cmp; } /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const { auto &DL = I.getModule()->getDataLayout(); switch (Intrinsic) { case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: case Intrinsic::aarch64_neon_ld1x2: case Intrinsic::aarch64_neon_ld1x3: case Intrinsic::aarch64_neon_ld1x4: case Intrinsic::aarch64_neon_ld2lane: case Intrinsic::aarch64_neon_ld3lane: case Intrinsic::aarch64_neon_ld4lane: case Intrinsic::aarch64_neon_ld2r: case Intrinsic::aarch64_neon_ld3r: case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; Info.align = 0; Info.vol = false; // volatile loads with NEON intrinsics not supported Info.readMem = true; Info.writeMem = false; return true; } case Intrinsic::aarch64_neon_st2: case Intrinsic::aarch64_neon_st3: case Intrinsic::aarch64_neon_st4: case Intrinsic::aarch64_neon_st1x2: case Intrinsic::aarch64_neon_st1x3: case Intrinsic::aarch64_neon_st1x4: case Intrinsic::aarch64_neon_st2lane: case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: { Info.opc = ISD::INTRINSIC_VOID; // Conservatively set memVT to the entire set of vectors stored. unsigned NumElts = 0; for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; Info.align = 0; Info.vol = false; // volatile stores with NEON intrinsics not supported Info.readMem = false; Info.writeMem = true; return true; } case Intrinsic::aarch64_ldaxr: case Intrinsic::aarch64_ldxr: { PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = true; Info.writeMem = false; return true; } case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = false; Info.writeMem = true; return true; } case Intrinsic::aarch64_ldaxp: case Intrinsic::aarch64_ldxp: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = 16; Info.vol = true; Info.readMem = true; Info.writeMem = false; return true; } case Intrinsic::aarch64_stlxp: case Intrinsic::aarch64_stxp: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(2); Info.offset = 0; Info.align = 16; Info.vol = true; Info.readMem = false; Info.writeMem = true; return true; } default: break; } return false; } // Truncations from 64-bit GPR to 32-bit GPR is free. bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 > NumBits2; } bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 > NumBits2; } /// Check if it is profitable to hoist instruction in then/else to if. /// Not profitable if I and it's user can form a FMA instruction /// because we prefer FMSUB/FMADD. bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { if (I->getOpcode() != Instruction::FMul) return true; if (I->getNumUses() != 1) return true; Instruction *User = I->user_back(); if (User && !(User->getOpcode() == Instruction::FSub || User->getOpcode() == Instruction::FAdd)) return true; const TargetOptions &Options = getTargetMachine().Options; const DataLayout &DL = I->getModule()->getDataLayout(); EVT VT = getValueType(DL, User->getOperand(0)->getType()); if (isFMAFasterThanFMulAndFAdd(VT) && isOperationLegalOrCustom(ISD::FMA, VT) && (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)) return false; return true; } // All 32-bit GPR operations implicitly zero the high-half of the corresponding // 64-bit GPR. bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 == 32 && NumBits2 == 64; } bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 == 32 && NumBits2 == 64; } bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { EVT VT1 = Val.getValueType(); if (isZExtFree(VT1, VT2)) { return true; } if (Val.getOpcode() != ISD::LOAD) return false; // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && VT1.getSizeInBits() <= 32); } bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { if (isa(Ext)) return false; // Vector types are next free. if (Ext->getType()->isVectorTy()) return false; for (const Use &U : Ext->uses()) { // The extension is free if we can fold it with a left shift in an // addressing mode or an arithmetic operation: add, sub, and cmp. // Is there a shift? const Instruction *Instr = cast(U.getUser()); // Is this a constant shift? switch (Instr->getOpcode()) { case Instruction::Shl: if (!isa(Instr->getOperand(1))) return false; break; case Instruction::GetElementPtr: { gep_type_iterator GTI = gep_type_begin(Instr); auto &DL = Ext->getModule()->getDataLayout(); std::advance(GTI, U.getOperandNo()); Type *IdxTy = *GTI; // This extension will end up with a shift because of the scaling factor. // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0. // Get the shift amount based on the scaling factor: // log2(sizeof(IdxTy)) - log2(8). uint64_t ShiftAmt = countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3; // Is the constant foldable in the shift of the addressing mode? // I.e., shift amount is between 1 and 4 inclusive. if (ShiftAmt == 0 || ShiftAmt > 4) return false; break; } case Instruction::Trunc: // Check if this is a noop. // trunc(sext ty1 to ty2) to ty1. if (Instr->getType() == Ext->getOperand(0)->getType()) continue; // FALL THROUGH. default: return false; } // At this point we can use the bfm family, so this extension is free // for that use. } return true; } bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType, unsigned &RequiredAligment) const { if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) return false; // Cyclone supports unaligned accesses. RequiredAligment = 0; unsigned NumBits = LoadedType->getPrimitiveSizeInBits(); return NumBits == 32 || NumBits == 64; } bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const { if (!LoadedType.isSimple() || (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) return false; // Cyclone supports unaligned accesses. RequiredAligment = 0; unsigned NumBits = LoadedType.getSizeInBits(); return NumBits == 32 || NumBits == 64; } /// \brief Lower an interleaved load into a ldN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements /// /// Into: /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr) /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VecTy = Shuffles[0]->getType(); unsigned VecSize = DL.getTypeSizeInBits(VecTy); // Skip if we do not have NEON and skip illegal vector types. if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128)) return false; // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. Type *EltTy = VecTy->getVectorElementType(); if (EltTy->isPointerTy()) VecTy = VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); Type *Tys[2] = {VecTy, PtrTy}; static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, Intrinsic::aarch64_neon_ld3, Intrinsic::aarch64_neon_ld4}; Function *LdNFunc = Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); IRBuilder<> Builder(LI); Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy); CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN"); // Replace uses of each shufflevector with the corresponding vector loaded // by ldN. for (unsigned i = 0; i < Shuffles.size(); i++) { ShuffleVectorInst *SVI = Shuffles[i]; unsigned Index = Indices[i]; Value *SubVec = Builder.CreateExtractValue(LdN, Index); // Convert the integer vector to pointer vector if the element is pointer. if (EltTy->isPointerTy()) SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType()); SVI->replaceAllUsesWith(SubVec); } return true; } /// \brief Get a mask consisting of sequential integers starting from \p Start. /// /// I.e. static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, unsigned NumElts) { SmallVector Mask; for (unsigned i = 0; i < NumElts; i++) Mask.push_back(Builder.getInt32(Start + i)); return ConstantVector::get(Mask); } /// \brief Lower an interleaved store into a stN intrinsic. /// /// E.g. Lower an interleaved store (Factor = 3): /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> /// store <12 x i32> %i.vec, <12 x i32>* %ptr /// /// Into: /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) /// /// Note that the new shufflevectors will be removed and we'll only generate one /// st3 instruction in CodeGen. bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); VectorType *VecTy = SVI->getType(); assert(VecTy->getVectorNumElements() % Factor == 0 && "Invalid interleaved store"); unsigned NumSubElts = VecTy->getVectorNumElements() / Factor; Type *EltTy = VecTy->getVectorElementType(); VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); const DataLayout &DL = SI->getModule()->getDataLayout(); unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // Skip if we do not have NEON and skip illegal vector types. if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128)) return false; Value *Op0 = SVI->getOperand(0); Value *Op1 = SVI->getOperand(1); IRBuilder<> Builder(SI); // StN intrinsics don't support pointer vectors as arguments. Convert pointer // vectors to integer vectors. if (EltTy->isPointerTy()) { Type *IntTy = DL.getIntPtrType(EltTy); unsigned NumOpElts = dyn_cast(Op0->getType())->getVectorNumElements(); // Convert to the corresponding integer vector. Type *IntVecTy = VectorType::get(IntTy, NumOpElts); Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); SubVecTy = VectorType::get(IntTy, NumSubElts); } Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); Type *Tys[2] = {SubVecTy, PtrTy}; static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2, Intrinsic::aarch64_neon_st3, Intrinsic::aarch64_neon_st4}; Function *StNFunc = Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); SmallVector Ops; // Split the shufflevector operands into sub vectors for the new stN call. for (unsigned i = 0; i < Factor; i++) Ops.push_back(Builder.CreateShuffleVector( Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts))); Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy)); Builder.CreateCall(StNFunc, Ops); return true; } static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, unsigned AlignCheck) { return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && (DstAlign == 0 || DstAlign % AlignCheck == 0)); } EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { // Don't use AdvSIMD to implement 16-byte memset. It would have taken one // instruction to materialize the v2i64 zero and one store (with restrictive // addressing mode). Just do two i64 store of zero-registers. bool Fast; const Function *F = MF.getFunction(); if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && !F->hasFnAttribute(Attribute::NoImplicitFloat) && (memOpAlign(SrcAlign, DstAlign, 16) || (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast))) return MVT::f128; if (Size >= 8 && (memOpAlign(SrcAlign, DstAlign, 8) || (allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast))) return MVT::i64; if (Size >= 4 && (memOpAlign(SrcAlign, DstAlign, 4) || (allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast))) return MVT::i32; return MVT::Other; } // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) return true; return false; } // Integer comparisons are implemented with ADDS/SUBS, so the range of valid // immediates is the same as for an add or a sub. bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { if (Immed < 0) Immed *= -1; return isLegalAddImmediate(Immed); } /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { // AArch64 has five basic addressing modes: // reg // reg + 9-bit signed offset // reg + SIZE_IN_BYTES * 12-bit unsigned offset // reg1 + reg2 // reg + SIZE_IN_BYTES * reg // No global is ever allowed as a base. if (AM.BaseGV) return false; // No reg+reg+imm addressing. if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) return false; // check reg + imm case: // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 uint64_t NumBytes = 0; if (Ty->isSized()) { uint64_t NumBits = DL.getTypeSizeInBits(Ty); NumBytes = NumBits / 8; if (!isPowerOf2_64(NumBits)) NumBytes = 0; } if (!AM.Scale) { int64_t Offset = AM.BaseOffs; // 9-bit signed offset if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1) return true; // 12-bit unsigned offset unsigned shift = Log2_64(NumBytes); if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && // Must be a multiple of NumBytes (NumBytes is a power of 2) (Offset >> shift) << shift == Offset) return true; return false; } // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 if (!AM.Scale || AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) return true; return false; } int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { // Scaling factors are not free at all. // Operands | Rt Latency // ------------------------------------------- // Rt, [Xn, Xm] | 4 // ------------------------------------------- // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 // Rt, [Xn, Wm, #imm] | if (isLegalAddressingMode(DL, AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 if // it is not equal to 0 or 1. return AM.Scale != 0 && AM.Scale != 1; return -1; } bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); if (!VT.isSimple()) return false; switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: case MVT::f64: return true; default: break; } return false; } const MCPhysReg * AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { // LR is a callee-save register, but we must treat it as clobbered by any call // site. Hence we include LR in the scratch registers, which are in turn added // as implicit-defs for stackmaps and patchpoints. static const MCPhysReg ScratchRegs[] = { AArch64::X16, AArch64::X17, AArch64::LR, 0 }; return ScratchRegs; } bool AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const { EVT VT = N->getValueType(0); // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine // it with shift to let it be lowered to UBFX. if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && isa(N->getOperand(1))) { uint64_t TruncMask = N->getConstantOperandVal(1); if (isMask_64(TruncMask) && N->getOperand(0).getOpcode() == ISD::SRL && isa(N->getOperand(0)->getOperand(1))) return false; } return true; } bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); if (BitSize == 0) return false; int64_t Val = Imm.getSExtValue(); if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) return true; if ((int64_t)Val < 0) Val = ~Val; if (BitSize == 32) Val &= (1LL << 32) - 1; unsigned LZ = countLeadingZeros((uint64_t)Val); unsigned Shift = (63 - LZ) / 16; // MOVZ is free so return true for one or fewer MOVK. return Shift < 3; } // Generate SUBS and CSEL for integer abs. static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) // and change it to SUB and CSEL. if (VT.isInteger() && N->getOpcode() == ISD::XOR && N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) if (ConstantSDNode *Y1C = dyn_cast(N1.getOperand(1))) if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0.getOperand(0)); // Generate SUBS & CSEL. SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), N0.getOperand(0), DAG.getConstant(0, DL, VT)); return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, DAG.getConstant(AArch64CC::PL, DL, MVT::i32), SDValue(Cmp.getNode(), 1)); } return SDValue(); } // performXorCombine - Attempts to handle integer ABS. static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); return performIntegerAbsCombine(N, DAG); } SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector *Created) const { // fold (sdiv X, pow2) EVT VT = N->getValueType(0); if ((VT != MVT::i32 && VT != MVT::i64) || !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); unsigned Lg2 = Divisor.countTrailingZeros(); SDValue Zero = DAG.getConstant(0, DL, VT); SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); // Add (N0 < 0) ? Pow2 - 1 : 0; SDValue CCVal; SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); if (Created) { Created->push_back(Cmp.getNode()); Created->push_back(Add.getNode()); Created->push_back(CSel.getNode()); } // Divide by pow2. SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64)); // If we're dividing by a positive value, we're done. Otherwise, we must // negate the result. if (Divisor.isNonNegative()) return SRA; if (Created) Created->push_back(SRA.getNode()); return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); } static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); // Multiplication of a power of two plus/minus one can be done more // cheaply as as shift+add/sub. For now, this is true unilaterally. If // future CPUs have a cheaper MADD instruction, this may need to be // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and // 64-bit is 5 cycles, so this is always a win. if (ConstantSDNode *C = dyn_cast(N->getOperand(1))) { APInt Value = C->getAPIntValue(); EVT VT = N->getValueType(0); SDLoc DL(N); if (Value.isNonNegative()) { // (mul x, 2^N + 1) => (add (shl x, N), x) APInt VM1 = Value - 1; if (VM1.isPowerOf2()) { SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(VM1.logBase2(), DL, MVT::i64)); return DAG.getNode(ISD::ADD, DL, VT, ShiftedVal, N->getOperand(0)); } // (mul x, 2^N - 1) => (sub (shl x, N), x) APInt VP1 = Value + 1; if (VP1.isPowerOf2()) { SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(VP1.logBase2(), DL, MVT::i64)); return DAG.getNode(ISD::SUB, DL, VT, ShiftedVal, N->getOperand(0)); } } else { // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) APInt VNP1 = -Value + 1; if (VNP1.isPowerOf2()) { SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(VNP1.logBase2(), DL, MVT::i64)); return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), ShiftedVal); } // (mul x, -(2^N + 1)) => - (add (shl x, N), x) APInt VNM1 = -Value - 1; if (VNM1.isPowerOf2()) { SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(VNM1.logBase2(), DL, MVT::i64)); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, ShiftedVal, N->getOperand(0)); return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Add); } } } return SDValue(); } static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG) { // Take advantage of vector comparisons producing 0 or -1 in each lane to // optimize away operation when it's from a constant. // // The general transformation is: // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> // AND(VECTOR_CMP(x,y), constant2) // constant2 = UNARYOP(constant) // Early exit if this isn't a vector operation, the operand of the // unary operation isn't a bitwise AND, or if the sizes of the operations // aren't the same. EVT VT = N->getValueType(0); if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) return SDValue(); // Now check that the other operand of the AND is a constant. We could // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. if (BuildVectorSDNode *BV = dyn_cast(N->getOperand(0)->getOperand(1))) { // Bail out if the vector isn't a constant. if (!BV->isConstant()) return SDValue(); // Everything checks out. Build up the new and improved node. SDLoc DL(N); EVT IntVT = BV->getValueType(0); // Create a new constant of the appropriate type for the transformed // DAG. SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); // The AND node needs bitcasts to/from an integer vector type around it. SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, N->getOperand(0)->getOperand(0), MaskConst); SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); return Res; } return SDValue(); } static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from // a constant. Vectors only. if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; EVT VT = N->getValueType(0); if (VT != MVT::f32 && VT != MVT::f64) return SDValue(); // Only optimize when the source and destination types have the same width. if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits()) return SDValue(); // If the result of an integer load is only used by an integer-to-float // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. // This eliminates an "integer-to-vector-move" UOP and improves throughput. SDValue N0 = N->getOperand(0); if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. !cast(N0)->isVolatile()) { LoadSDNode *LN0 = cast(N0); SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), LN0->getPointerInfo(), LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(), LN0->getAlignment()); // Make sure successors of the original load stay after it by updating them // to use the new Chain. DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); unsigned Opcode = (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF; return DAG.getNode(Opcode, SDLoc(N), VT, Load); } return SDValue(); } /// Fold a floating-point multiply by power of two into floating-point to /// fixed-point conversion. static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue Op = N->getOperand(0); if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) return SDValue(); SDValue ConstVec = Op->getOperand(1); if (!isa(ConstVec)) return SDValue(); MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); uint32_t FloatBits = FloatTy.getSizeInBits(); if (FloatBits != 32 && FloatBits != 64) return SDValue(); MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); uint32_t IntBits = IntTy.getSizeInBits(); if (IntBits != 16 && IntBits != 32 && IntBits != 64) return SDValue(); // Avoid conversions where iN is larger than the float (e.g., float -> i64). if (IntBits > FloatBits) return SDValue(); BitVector UndefElements; BuildVectorSDNode *BV = cast(ConstVec); int32_t Bits = IntBits == 64 ? 64 : 32; int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); if (C == -1 || C == 0 || C > Bits) return SDValue(); MVT ResTy; unsigned NumLanes = Op.getValueType().getVectorNumElements(); switch (NumLanes) { default: return SDValue(); case 2: ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; break; case 4: ResTy = MVT::v4i32; break; } SDLoc DL(N); bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs : Intrinsic::aarch64_neon_vcvtfp2fxu; SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); // We can handle smaller integers by generating an extra trunc. if (IntBits < FloatBits) FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); return FixConv; } /// Fold a floating-point divide by power of two into fixed-point to /// floating-point conversion. static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue Op = N->getOperand(0); unsigned Opc = Op->getOpcode(); if (!Op.getValueType().isVector() || (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) return SDValue(); SDValue ConstVec = N->getOperand(1); if (!isa(ConstVec)) return SDValue(); MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); int32_t IntBits = IntTy.getSizeInBits(); if (IntBits != 16 && IntBits != 32 && IntBits != 64) return SDValue(); MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); int32_t FloatBits = FloatTy.getSizeInBits(); if (FloatBits != 32 && FloatBits != 64) return SDValue(); // Avoid conversions where iN is larger than the float (e.g., i64 -> float). if (IntBits > FloatBits) return SDValue(); BitVector UndefElements; BuildVectorSDNode *BV = cast(ConstVec); int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); if (C == -1 || C == 0 || C > FloatBits) return SDValue(); MVT ResTy; unsigned NumLanes = Op.getValueType().getVectorNumElements(); switch (NumLanes) { default: return SDValue(); case 2: ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; break; case 4: ResTy = MVT::v4i32; break; } SDLoc DL(N); SDValue ConvInput = Op.getOperand(0); bool IsSigned = Opc == ISD::SINT_TO_FP; if (IntBits < FloatBits) ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, ResTy, ConvInput); unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp : Intrinsic::aarch64_neon_vcvtfxu2fp; return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, DAG.getConstant(C, DL, MVT::i32)); } /// An EXTR instruction is made up of two shifts, ORed together. This helper /// searches for and classifies those shifts. static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, bool &FromHi) { if (N.getOpcode() == ISD::SHL) FromHi = false; else if (N.getOpcode() == ISD::SRL) FromHi = true; else return false; if (!isa(N.getOperand(1))) return false; ShiftAmount = N->getConstantOperandVal(1); Src = N->getOperand(0); return true; } /// EXTR instruction extracts a contiguous chunk of bits from two existing /// registers viewed as a high/low pair. This function looks for the pattern: /// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an /// EXTR. Can't quite be done in TableGen because the two immediates aren't /// independent. static SDValue tryCombineToEXTR(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); EVT VT = N->getValueType(0); assert(N->getOpcode() == ISD::OR && "Unexpected root"); if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); SDValue LHS; uint32_t ShiftLHS = 0; bool LHSFromHi = 0; if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) return SDValue(); SDValue RHS; uint32_t ShiftRHS = 0; bool RHSFromHi = 0; if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) return SDValue(); // If they're both trying to come from the high part of the register, they're // not really an EXTR. if (LHSFromHi == RHSFromHi) return SDValue(); if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) return SDValue(); if (LHSFromHi) { std::swap(LHS, RHS); std::swap(ShiftLHS, ShiftRHS); } return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, DAG.getConstant(ShiftRHS, DL, MVT::i64)); } static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); if (!VT.isVector()) return SDValue(); SDValue N0 = N->getOperand(0); if (N0.getOpcode() != ISD::AND) return SDValue(); SDValue N1 = N->getOperand(1); if (N1.getOpcode() != ISD::AND) return SDValue(); // We only have to look for constant vectors here since the general, variable // case can be handled in TableGen. unsigned Bits = VT.getVectorElementType().getSizeInBits(); uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); for (int i = 1; i >= 0; --i) for (int j = 1; j >= 0; --j) { BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(i)); BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(j)); if (!BVN0 || !BVN1) continue; bool FoundMatch = true; for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { ConstantSDNode *CN0 = dyn_cast(BVN0->getOperand(k)); ConstantSDNode *CN1 = dyn_cast(BVN1->getOperand(k)); if (!CN0 || !CN1 || CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { FoundMatch = false; break; } } if (FoundMatch) return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), N0->getOperand(1 - i), N1->getOperand(1 - j)); } return SDValue(); } static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) if (!EnableAArch64ExtrGeneration) return SDValue(); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); SDValue Res = tryCombineToEXTR(N, DCI); if (Res.getNode()) return Res; Res = tryCombineToBSL(N, DCI); if (Res.getNode()) return Res; return SDValue(); } static SDValue performBitcastCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Wait 'til after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); // Remove extraneous bitcasts around an extract_subvector. // For example, // (v4i16 (bitconvert // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) // becomes // (extract_subvector ((v8i16 ...), (i64 4))) // Only interested in 64-bit vectors as the ultimate result. EVT VT = N->getValueType(0); if (!VT.isVector()) return SDValue(); if (VT.getSimpleVT().getSizeInBits() != 64) return SDValue(); // Is the operand an extract_subvector starting at the beginning or halfway // point of the vector? A low half may also come through as an // EXTRACT_SUBREG, so look for that, too. SDValue Op0 = N->getOperand(0); if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && !(Op0->isMachineOpcode() && Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) return SDValue(); uint64_t idx = cast(Op0->getOperand(1))->getZExtValue(); if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) return SDValue(); } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { if (idx != AArch64::dsub) return SDValue(); // The dsub reference is equivalent to a lane zero subvector reference. idx = 0; } // Look through the bitcast of the input to the extract. if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) return SDValue(); SDValue Source = Op0->getOperand(0)->getOperand(0); // If the source type has twice the number of elements as our destination // type, we know this is an extract of the high or low half of the vector. EVT SVT = Source->getValueType(0); if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) return SDValue(); DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); // Create the simplified form to just extract the low or high half of the // vector directly rather than bothering with the bitcasts. SDLoc dl(N); unsigned NumElements = VT.getVectorNumElements(); if (idx) { SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); } else { SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32); return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, Source, SubReg), 0); } } static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); // Optimize concat_vectors of truncated vectors, where the intermediate // type is illegal, to avoid said illegality, e.g., // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), // (v2i16 (truncate (v2i64))))) // -> // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))), // (v4i32 (bitcast (v2i64))), // <0, 2, 4, 6>))) // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed // on both input and result type, so we might generate worse code. // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. if (N->getNumOperands() == 2 && N0->getOpcode() == ISD::TRUNCATE && N1->getOpcode() == ISD::TRUNCATE) { SDValue N00 = N0->getOperand(0); SDValue N10 = N1->getOperand(0); EVT N00VT = N00.getValueType(); if (N00VT == N10.getValueType() && (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) && N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) { MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16); SmallVector Mask(MidVT.getVectorNumElements()); for (size_t i = 0; i < Mask.size(); ++i) Mask[i] = i * 2; return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getVectorShuffle( MidVT, dl, DAG.getNode(ISD::BITCAST, dl, MidVT, N00), DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask)); } } // Wait 'til after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector // splat. The indexed instructions are going to be expecting a DUPLANE64, so // canonicalise to that. if (N0 == N1 && VT.getVectorNumElements() == 2) { assert(VT.getVectorElementType().getSizeInBits() == 64); return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), DAG.getConstant(0, dl, MVT::i64)); } // Canonicalise concat_vectors so that the right-hand vector has as few // bit-casts as possible before its real operation. The primary matching // destination for these operations will be the narrowing "2" instructions, // which depend on the operation being performed on this right-hand vector. // For example, // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) // becomes // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) if (N1->getOpcode() != ISD::BITCAST) return SDValue(); SDValue RHS = N1->getOperand(0); MVT RHSTy = RHS.getValueType().getSimpleVT(); // If the RHS is not a vector, this is not the pattern we're looking for. if (!RHSTy.isVector()) return SDValue(); DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), RHSTy.getVectorNumElements() * 2); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, DAG.getNode(ISD::BITCAST, dl, RHSTy, N0), RHS)); } static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Wait 'til after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); // Transform a scalar conversion of a value from a lane extract into a // lane extract of a vector conversion. E.g., from foo1 to foo2: // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } // // The second form interacts better with instruction selection and the // register allocator to avoid cross-class register copies that aren't // coalescable due to a lane reference. // Check the operand and see if it originates from a lane extract. SDValue Op1 = N->getOperand(1); if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { // Yep, no additional predication needed. Perform the transform. SDValue IID = N->getOperand(0); SDValue Shift = N->getOperand(2); SDValue Vec = Op1.getOperand(0); SDValue Lane = Op1.getOperand(1); EVT ResTy = N->getValueType(0); EVT VecResTy; SDLoc DL(N); // The vector width should be 128 bits by the time we get here, even // if it started as 64 bits (the extract_vector handling will have // done so). assert(Vec.getValueType().getSizeInBits() == 128 && "unexpected vector size on extract_vector_elt!"); if (Vec.getValueType() == MVT::v4i32) VecResTy = MVT::v4f32; else if (Vec.getValueType() == MVT::v2i64) VecResTy = MVT::v2f64; else llvm_unreachable("unexpected vector type!"); SDValue Convert = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); } return SDValue(); } // AArch64 high-vector "long" operations are formed by performing the non-high // version on an extract_subvector of each operand which gets the high half: // // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) // // However, there are cases which don't have an extract_high explicitly, but // have another operation that can be made compatible with one for free. For // example: // // (dupv64 scalar) --> (extract_high (dup128 scalar)) // // This routine does the actual conversion of such DUPs, once outer routines // have determined that everything else is in order. // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold // similarly here. static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { switch (N.getOpcode()) { case AArch64ISD::DUP: case AArch64ISD::DUPLANE8: case AArch64ISD::DUPLANE16: case AArch64ISD::DUPLANE32: case AArch64ISD::DUPLANE64: case AArch64ISD::MOVI: case AArch64ISD::MOVIshift: case AArch64ISD::MOVIedit: case AArch64ISD::MOVImsl: case AArch64ISD::MVNIshift: case AArch64ISD::MVNImsl: break; default: // FMOV could be supported, but isn't very useful, as it would only occur // if you passed a bitcast' floating point immediate to an eligible long // integer op (addl, smull, ...). return SDValue(); } MVT NarrowTy = N.getSimpleValueType(); if (!NarrowTy.is64BitVector()) return SDValue(); MVT ElementTy = NarrowTy.getVectorElementType(); unsigned NumElems = NarrowTy.getVectorNumElements(); MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2); SDLoc dl(N); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()), DAG.getConstant(NumElems, dl, MVT::i64)); } static bool isEssentiallyExtractSubvector(SDValue N) { if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) return true; return N.getOpcode() == ISD::BITCAST && N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; } /// \brief Helper structure to keep track of ISD::SET_CC operands. struct GenericSetCCInfo { const SDValue *Opnd0; const SDValue *Opnd1; ISD::CondCode CC; }; /// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code. struct AArch64SetCCInfo { const SDValue *Cmp; AArch64CC::CondCode CC; }; /// \brief Helper structure to keep track of SetCC information. union SetCCInfo { GenericSetCCInfo Generic; AArch64SetCCInfo AArch64; }; /// \brief Helper structure to be able to read SetCC information. If set to /// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a /// GenericSetCCInfo. struct SetCCInfoAndKind { SetCCInfo Info; bool IsAArch64; }; /// \brief Check whether or not \p Op is a SET_CC operation, either a generic or /// an /// AArch64 lowered one. /// \p SetCCInfo is filled accordingly. /// \post SetCCInfo is meanginfull only when this function returns true. /// \return True when Op is a kind of SET_CC operation. static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { // If this is a setcc, this is straight forward. if (Op.getOpcode() == ISD::SETCC) { SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); SetCCInfo.Info.Generic.CC = cast(Op.getOperand(2))->get(); SetCCInfo.IsAArch64 = false; return true; } // Otherwise, check if this is a matching csel instruction. // In other words: // - csel 1, 0, cc // - csel 0, 1, !cc if (Op.getOpcode() != AArch64ISD::CSEL) return false; // Set the information about the operands. // TODO: we want the operands of the Cmp not the csel SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3); SetCCInfo.IsAArch64 = true; SetCCInfo.Info.AArch64.CC = static_cast( cast(Op.getOperand(2))->getZExtValue()); // Check that the operands matches the constraints: // (1) Both operands must be constants. // (2) One must be 1 and the other must be 0. ConstantSDNode *TValue = dyn_cast(Op.getOperand(0)); ConstantSDNode *FValue = dyn_cast(Op.getOperand(1)); // Check (1). if (!TValue || !FValue) return false; // Check (2). if (!TValue->isOne()) { // Update the comparison when we are interested in !cc. std::swap(TValue, FValue); SetCCInfo.Info.AArch64.CC = AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC); } return TValue->isOne() && FValue->isNullValue(); } // Returns true if Op is setcc or zext of setcc. static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { if (isSetCC(Op, Info)) return true; return ((Op.getOpcode() == ISD::ZERO_EXTEND) && isSetCC(Op->getOperand(0), Info)); } // The folding we want to perform is: // (add x, [zext] (setcc cc ...) ) // --> // (csel x, (add x, 1), !cc ...) // // The latter will get matched to a CSINC instruction. static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); SDValue LHS = Op->getOperand(0); SDValue RHS = Op->getOperand(1); SetCCInfoAndKind InfoAndKind; // If neither operand is a SET_CC, give up. if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { std::swap(LHS, RHS); if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) return SDValue(); } // FIXME: This could be generatized to work for FP comparisons. EVT CmpVT = InfoAndKind.IsAArch64 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType() : InfoAndKind.Info.Generic.Opnd0->getValueType(); if (CmpVT != MVT::i32 && CmpVT != MVT::i64) return SDValue(); SDValue CCVal; SDValue Cmp; SDLoc dl(Op); if (InfoAndKind.IsAArch64) { CCVal = DAG.getConstant( AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl, MVT::i32); Cmp = *InfoAndKind.Info.AArch64.Cmp; } else Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1, ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), CCVal, DAG, dl); EVT VT = Op->getValueType(0); LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT)); return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); } // The basic add/sub long vector instructions have variants with "2" on the end // which act on the high-half of their inputs. They are normally matched by // patterns like: // // (add (zeroext (extract_high LHS)), // (zeroext (extract_high RHS))) // -> uaddl2 vD, vN, vM // // However, if one of the extracts is something like a duplicate, this // instruction can still be used profitably. This function puts the DAG into a // more appropriate form for those patterns to trigger. static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalizeOps()) return SDValue(); MVT VT = N->getSimpleValueType(0); if (!VT.is128BitVector()) { if (N->getOpcode() == ISD::ADD) return performSetccAddFolding(N, DAG); return SDValue(); } // Make sure both branches are extended in the same way. SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); if ((LHS.getOpcode() != ISD::ZERO_EXTEND && LHS.getOpcode() != ISD::SIGN_EXTEND) || LHS.getOpcode() != RHS.getOpcode()) return SDValue(); unsigned ExtType = LHS.getOpcode(); // It's not worth doing if at least one of the inputs isn't already an // extract, but we don't know which it'll be so we have to try both. if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); if (!RHS.getNode()) return SDValue(); RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); if (!LHS.getNode()) return SDValue(); LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); } return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); } // Massage DAGs which we can use the high-half "long" operations on into // something isel will recognize better. E.g. // // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> // (aarch64_neon_umull (extract_high (v2i64 vec))) // (extract_high (v2i64 (dup128 scalar))))) // static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); assert(LHS.getValueType().is64BitVector() && RHS.getValueType().is64BitVector() && "unexpected shape for long operation"); // Either node could be a DUP, but it's not worth doing both of them (you'd // just as well use the non-high version) so look for a corresponding extract // operation on the other "wing". if (isEssentiallyExtractSubvector(LHS)) { RHS = tryExtendDUPToExtractHigh(RHS, DAG); if (!RHS.getNode()) return SDValue(); } else if (isEssentiallyExtractSubvector(RHS)) { LHS = tryExtendDUPToExtractHigh(LHS, DAG); if (!LHS.getNode()) return SDValue(); } return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), N->getOperand(0), LHS, RHS); } static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { MVT ElemTy = N->getSimpleValueType(0).getScalarType(); unsigned ElemBits = ElemTy.getSizeInBits(); int64_t ShiftAmount; if (BuildVectorSDNode *BVN = dyn_cast(N->getOperand(2))) { APInt SplatValue, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, ElemBits) || SplatBitSize != ElemBits) return SDValue(); ShiftAmount = SplatValue.getSExtValue(); } else if (ConstantSDNode *CVN = dyn_cast(N->getOperand(2))) { ShiftAmount = CVN->getSExtValue(); } else return SDValue(); unsigned Opcode; bool IsRightShift; switch (IID) { default: llvm_unreachable("Unknown shift intrinsic"); case Intrinsic::aarch64_neon_sqshl: Opcode = AArch64ISD::SQSHL_I; IsRightShift = false; break; case Intrinsic::aarch64_neon_uqshl: Opcode = AArch64ISD::UQSHL_I; IsRightShift = false; break; case Intrinsic::aarch64_neon_srshl: Opcode = AArch64ISD::SRSHR_I; IsRightShift = true; break; case Intrinsic::aarch64_neon_urshl: Opcode = AArch64ISD::URSHR_I; IsRightShift = true; break; case Intrinsic::aarch64_neon_sqshlu: Opcode = AArch64ISD::SQSHLU_I; IsRightShift = false; break; } if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { SDLoc dl(N); return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), DAG.getConstant(-ShiftAmount, dl, MVT::i32)); } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) { SDLoc dl(N); return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), DAG.getConstant(ShiftAmount, dl, MVT::i32)); } return SDValue(); } // The CRC32[BH] instructions ignore the high bits of their data operand. Since // the intrinsics must be legal and take an i32, this means there's almost // certainly going to be a zext in the DAG which we can eliminate. static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { SDValue AndN = N->getOperand(2); if (AndN.getOpcode() != ISD::AND) return SDValue(); ConstantSDNode *CMask = dyn_cast(AndN.getOperand(1)); if (!CMask || CMask->getZExtValue() != Mask) return SDValue(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); } static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG) { SDLoc dl(N); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), DAG.getNode(Opc, dl, N->getOperand(1).getSimpleValueType(), N->getOperand(1)), DAG.getConstant(0, dl, MVT::i64)); } static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; unsigned IID = getIntrinsicID(N); switch (IID) { default: break; case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); case Intrinsic::aarch64_neon_saddv: return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); case Intrinsic::aarch64_neon_uaddv: return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG); case Intrinsic::aarch64_neon_sminv: return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG); case Intrinsic::aarch64_neon_uminv: return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG); case Intrinsic::aarch64_neon_smaxv: return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG); case Intrinsic::aarch64_neon_umaxv: return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); case Intrinsic::aarch64_neon_fmax: return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fmin: return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fmaxnm: return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fminnm: return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: case Intrinsic::aarch64_neon_pmull: case Intrinsic::aarch64_neon_sqdmull: return tryCombineLongOpWithDup(IID, N, DCI, DAG); case Intrinsic::aarch64_neon_sqshl: case Intrinsic::aarch64_neon_uqshl: case Intrinsic::aarch64_neon_sqshlu: case Intrinsic::aarch64_neon_srshl: case Intrinsic::aarch64_neon_urshl: return tryCombineShiftImm(IID, N, DAG); case Intrinsic::aarch64_crc32b: case Intrinsic::aarch64_crc32cb: return tryCombineCRC32(0xff, N, DAG); case Intrinsic::aarch64_crc32h: case Intrinsic::aarch64_crc32ch: return tryCombineCRC32(0xffff, N, DAG); } return SDValue(); } static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then // we can convert that DUP into another extract_high (of a bigger DUP), which // helps the backend to decide that an sabdl2 would be useful, saving a real // extract_high operation. if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { SDNode *ABDNode = N->getOperand(0).getNode(); unsigned IID = getIntrinsicID(ABDNode); if (IID == Intrinsic::aarch64_neon_sabd || IID == Intrinsic::aarch64_neon_uabd) { SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); if (!NewABD.getNode()) return SDValue(); return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD); } } // This is effectively a custom type legalization for AArch64. // // Type legalization will split an extend of a small, legal, type to a larger // illegal type by first splitting the destination type, often creating // illegal source types, which then get legalized in isel-confusing ways, // leading to really terrible codegen. E.g., // %result = v8i32 sext v8i8 %value // becomes // %losrc = extract_subreg %value, ... // %hisrc = extract_subreg %value, ... // %lo = v4i32 sext v4i8 %losrc // %hi = v4i32 sext v4i8 %hisrc // Things go rapidly downhill from there. // // For AArch64, the [sz]ext vector instructions can only go up one element // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 // take two instructions. // // This implies that the most efficient way to do the extend from v8i8 // to two v4i32 values is to first extend the v8i8 to v8i16, then do // the normal splitting to happen for the v8i16->v8i32. // This is pre-legalization to catch some cases where the default // type legalization will create ill-tempered code. if (!DCI.isBeforeLegalizeOps()) return SDValue(); // We're only interested in cleaning things up for non-legal vector types // here. If both the source and destination are legal, things will just // work naturally without any fiddling. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT ResVT = N->getValueType(0); if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) return SDValue(); // If the vector type isn't a simple VT, it's beyond the scope of what // we're worried about here. Let legalization do its thing and hope for // the best. SDValue Src = N->getOperand(0); EVT SrcVT = Src->getValueType(0); if (!ResVT.isSimple() || !SrcVT.isSimple()) return SDValue(); // If the source VT is a 64-bit vector, we can play games and get the // better results we want. if (SrcVT.getSizeInBits() != 64) return SDValue(); unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); unsigned ElementCount = SrcVT.getVectorNumElements(); SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); SDLoc DL(N); Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); // Now split the rest of the operation into two halves, each with a 64 // bit source. EVT LoVT, HiVT; SDValue Lo, Hi; unsigned NumElements = ResVT.getVectorNumElements(); assert(!(NumElements & 1) && "Splitting vector, but not in half!"); LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), NumElements / 2); EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), LoVT.getVectorNumElements()); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, DAG.getConstant(0, DL, MVT::i64)); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64)); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); // Now combine the parts back together so we still have a single result // like the combiner expects. return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); } /// Replace a splat of a scalar to a vector store by scalar stores of the scalar /// value. The load store optimizer pass will merge them to store pair stores. /// This has better performance than a splat of the scalar followed by a split /// vector store. Even if the stores are not merged it is four stores vs a dup, /// followed by an ext.b and two stores. static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { SDValue StVal = St->getValue(); EVT VT = StVal.getValueType(); // Don't replace floating point stores, they possibly won't be transformed to // stp because of the store pair suppress pass. if (VT.isFloatingPoint()) return SDValue(); // Check for insert vector elements. if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) return SDValue(); // We can express a splat as store pair(s) for 2 or 4 elements. unsigned NumVecElts = VT.getVectorNumElements(); if (NumVecElts != 4 && NumVecElts != 2) return SDValue(); SDValue SplatVal = StVal.getOperand(1); unsigned RemainInsertElts = NumVecElts - 1; // Check that this is a splat. while (--RemainInsertElts) { SDValue NextInsertElt = StVal.getOperand(0); if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT) return SDValue(); if (NextInsertElt.getOperand(1) != SplatVal) return SDValue(); StVal = NextInsertElt; } unsigned OrigAlignment = St->getAlignment(); unsigned EltOffset = NumVecElts == 4 ? 4 : 8; unsigned Alignment = std::min(OrigAlignment, EltOffset); // Create scalar stores. This is at least as good as the code sequence for a // split unaligned store which is a dup.s, ext.b, and two stores. // Most of the time the three stores should be replaced by store pair // instructions (stp). SDLoc DL(St); SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); unsigned Offset = EltOffset; while (--NumVecElts) { SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), Alignment); Offset += EltOffset; } return NewST1; } static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { if (!DCI.isBeforeLegalize()) return SDValue(); StoreSDNode *S = cast(N); if (S->isVolatile()) return SDValue(); // FIXME: The logic for deciding if an unaligned store should be split should // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be // a call to that function here. // Cyclone has bad performance on unaligned 16B stores when crossing line and // page boundaries. We want to split such stores. if (!Subtarget->isCyclone()) return SDValue(); // Don't split at -Oz. if (DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); SDValue StVal = S->getValue(); EVT VT = StVal.getValueType(); // Don't split v2i64 vectors. Memcpy lowering produces those and splitting // those up regresses performance on micro-benchmarks and olden/bh. if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64) return SDValue(); // Split unaligned 16B stores. They are terrible for performance. // Don't split stores with alignment of 1 or 2. Code that uses clang vector // extensions can use this to mark that it does not want splitting to happen // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of // eliminating alignment hazards is only 1 in 8 for alignment of 2. if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || S->getAlignment() <= 2) return SDValue(); // If we get a splat of a scalar convert this vector store to a store of // scalars. They will be merged into store pairs thereby removing two // instructions. if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S)) return ReplacedSplat; SDLoc DL(S); unsigned NumElts = VT.getVectorNumElements() / 2; // Split VT into two. EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, DAG.getConstant(0, DL, MVT::i64)); SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, DAG.getConstant(NumElts, DL, MVT::i64)); SDValue BasePtr = S->getBasePtr(); SDValue NewST1 = DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), S->getAlignment()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(8, DL, MVT::i64)); return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), S->getAlignment()); } /// Target-specific DAG combine function for post-increment LD1 (lane) and /// post-increment LD1R. static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); unsigned LoadIdx = IsLaneOp ? 1 : 0; SDNode *LD = N->getOperand(LoadIdx).getNode(); // If it is not LOAD, can not do such combine. if (LD->getOpcode() != ISD::LOAD) return SDValue(); LoadSDNode *LoadSDN = cast(LD); EVT MemVT = LoadSDN->getMemoryVT(); // Check if memory operand is the same type as the vector element. if (MemVT != VT.getVectorElementType()) return SDValue(); // Check if there are other uses. If so, do not combine as it will introduce // an extra load. for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; ++UI) { if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. continue; if (*UI != N) return SDValue(); } SDValue Addr = LD->getOperand(1); SDValue Vector = N->getOperand(0); // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = Addr.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User->getOpcode() != ISD::ADD || UI.getUse().getResNo() != Addr.getResNo()) continue; // Check that the add is independent of the load. Otherwise, folding it // would create a cycle. if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) continue; // Also check that add is not used in the vector operand. This would also // create a cycle. if (User->isPredecessorOf(Vector.getNode())) continue; // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { uint32_t IncVal = CInc->getZExtValue(); unsigned NumBytes = VT.getScalarSizeInBits() / 8; if (IncVal != NumBytes) continue; Inc = DAG.getRegister(AArch64::XZR, MVT::i64); } // Finally, check that the vector doesn't depend on the load. // Again, this would create a cycle. // The load depending on the vector is fine, as that's the case for the // LD1*post we'll eventually generate anyway. if (LoadSDN->isPredecessorOf(Vector.getNode())) continue; SmallVector Ops; Ops.push_back(LD->getOperand(0)); // Chain if (IsLaneOp) { Ops.push_back(Vector); // The vector to be inserted Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector } Ops.push_back(Addr); Ops.push_back(Inc); EVT Tys[3] = { VT, MVT::i64, MVT::Other }; SDVTList SDTys = DAG.getVTList(Tys); unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, MemVT, LoadSDN->getMemOperand()); // Update the uses. SmallVector NewResults; NewResults.push_back(SDValue(LD, 0)); // The result of load NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain DCI.CombineTo(LD, NewResults); DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register break; } return SDValue(); } /// Simplify \Addr given that the top byte of it is ignored by HW during /// address translation. static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { APInt DemandedMask = APInt::getLowBitsSet(64, 56); APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) { DCI.CommitTargetLoweringOpt(TLO); return true; } return false; } static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { SDValue Split = split16BStores(N, DCI, DAG, Subtarget); if (Split.getNode()) return Split; if (Subtarget->supportsAddressTopByteIgnored() && performTBISimplification(N->getOperand(2), DCI, DAG)) return SDValue(N, 0); return SDValue(); } /// This function handles the log2-shuffle pattern produced by the /// LoopVectorizer for the across vector reduction. It consists of /// log2(NumVectorElements) steps and, in each step, 2^(s) elements /// are reduced, where s is an induction variable from 0 to /// log2(NumVectorElements). static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, unsigned Op, SelectionDAG &DAG) { EVT VTy = OpV->getOperand(0).getValueType(); if (!VTy.isVector()) return SDValue(); int NumVecElts = VTy.getVectorNumElements(); if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { if (NumVecElts != 4) return SDValue(); } else { if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) return SDValue(); } int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); SDValue PreOp = OpV; // Iterate over each step of the across vector reduction. for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { SDValue CurOp = PreOp.getOperand(0); SDValue Shuffle = PreOp.getOperand(1); if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { // Try to swap the 1st and 2nd operand as add and min/max instructions // are commutative. CurOp = PreOp.getOperand(1); Shuffle = PreOp.getOperand(0); if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) return SDValue(); } // Check if the input vector is fed by the operator we want to handle, // except the last step; the very first input vector is not necessarily // the same operator we are handling. if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) return SDValue(); // Check if it forms one step of the across vector reduction. // E.g., // %cur = add %1, %0 // %shuffle = vector_shuffle %cur, <2, 3, u, u> // %pre = add %cur, %shuffle if (Shuffle.getOperand(0) != CurOp) return SDValue(); int NumMaskElts = 1 << CurStep; ArrayRef Mask = cast(Shuffle)->getMask(); // Check mask values in each step. // We expect the shuffle mask in each step follows a specific pattern // denoted here by the form, where M is a sequence of integers // starting from NumMaskElts, increasing by 1, and the number integers // in M should be NumMaskElts. U is a sequence of UNDEFs and the number // of undef in U should be NumVecElts - NumMaskElts. // E.g., for <8 x i16>, mask values in each step should be : // step 0 : <1,u,u,u,u,u,u,u> // step 1 : <2,3,u,u,u,u,u,u> // step 2 : <4,5,6,7,u,u,u,u> for (int i = 0; i < NumVecElts; ++i) if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) || (i >= NumMaskElts && !(Mask[i] < 0))) return SDValue(); PreOp = CurOp; } unsigned Opcode; bool IsIntrinsic = false; switch (Op) { default: llvm_unreachable("Unexpected operator for across vector reduction"); case ISD::ADD: Opcode = AArch64ISD::UADDV; break; case ISD::SMAX: Opcode = AArch64ISD::SMAXV; break; case ISD::UMAX: Opcode = AArch64ISD::UMAXV; break; case ISD::SMIN: Opcode = AArch64ISD::SMINV; break; case ISD::UMIN: Opcode = AArch64ISD::UMINV; break; case ISD::FMAXNUM: Opcode = Intrinsic::aarch64_neon_fmaxnmv; IsIntrinsic = true; break; case ISD::FMINNUM: Opcode = Intrinsic::aarch64_neon_fminnmv; IsIntrinsic = true; break; } SDLoc DL(N); return IsIntrinsic ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), DAG.getConstant(Opcode, DL, MVT::i32), PreOp) : DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), DAG.getConstant(0, DL, MVT::i64)); } /// Target-specific DAG combine for the across vector min/max reductions. /// This function specifically handles the final clean-up step of the vector /// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle /// pattern, which narrows down and finds the final min/max value from all /// elements of the vector. /// For example, for a <16 x i8> vector : /// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> /// %smax0 = smax %arr, svn0 /// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u> /// %smax1 = smax %smax0, %svn1 /// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u> /// %smax2 = smax %smax1, svn2 /// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> /// %sc = setcc %smax2, %svn3, gt /// %n0 = extract_vector_elt %sc, #0 /// %n1 = extract_vector_elt %smax2, #0 /// %n2 = extract_vector_elt $smax2, #1 /// %result = select %n0, %n1, n2 /// becomes : /// %1 = smaxv %0 /// %result = extract_vector_elt %1, 0 static SDValue performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue N0 = N->getOperand(0); SDValue IfTrue = N->getOperand(1); SDValue IfFalse = N->getOperand(2); // Check if the SELECT merges up the final result of the min/max // from a vector. if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT || IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // Expect N0 is fed by SETCC. SDValue SetCC = N0.getOperand(0); EVT SetCCVT = SetCC.getValueType(); if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() || SetCCVT.getVectorElementType() != MVT::i1) return SDValue(); SDValue VectorOp = SetCC.getOperand(0); unsigned Op = VectorOp->getOpcode(); // Check if the input vector is fed by the operator we want to handle. if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM) return SDValue(); EVT VTy = VectorOp.getValueType(); if (!VTy.isVector()) return SDValue(); if (VTy.getSizeInBits() < 64) return SDValue(); EVT EltTy = VTy.getVectorElementType(); if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { if (EltTy != MVT::f32) return SDValue(); } else { if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) return SDValue(); } // Check if extracting from the same vector. // For example, // %sc = setcc %vector, %svn1, gt // %n0 = extract_vector_elt %sc, #0 // %n1 = extract_vector_elt %vector, #0 // %n2 = extract_vector_elt $vector, #1 if (!(VectorOp == IfTrue->getOperand(0) && VectorOp == IfFalse->getOperand(0))) return SDValue(); // Check if the condition code is matched with the operator type. ISD::CondCode CC = cast(SetCC->getOperand(2))->get(); if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) || (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE && CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT && CC != ISD::SETGE) || (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE && CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT && CC != ISD::SETLE)) return SDValue(); // Expect to check only lane 0 from the vector SETCC. if (!isNullConstant(N0.getOperand(1))) return SDValue(); // Expect to extract the true value from lane 0. if (!isNullConstant(IfTrue.getOperand(1))) return SDValue(); // Expect to extract the false value from lane 1. if (!isOneConstant(IfFalse.getOperand(1))) return SDValue(); return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); } /// Target-specific DAG combine for the across vector add reduction. /// This function specifically handles the final clean-up step of the vector /// add reduction produced by the LoopVectorizer. It is the log2-shuffle /// pattern, which adds all elements of a vector together. /// For example, for a <4 x i32> vector : /// %1 = vector_shuffle %0, <2,3,u,u> /// %2 = add %0, %1 /// %3 = vector_shuffle %2, <1,u,u,u> /// %4 = add %2, %3 /// %result = extract_vector_elt %4, 0 /// becomes : /// %0 = uaddv %0 /// %result = extract_vector_elt %0, 0 static SDValue performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Check if the input vector is fed by the ADD. if (N0->getOpcode() != ISD::ADD) return SDValue(); // The vector extract idx must constant zero because we only expect the final // result of the reduction is placed in lane 0. if (!isNullConstant(N1)) return SDValue(); EVT VTy = N0.getValueType(); if (!VTy.isVector()) return SDValue(); EVT EltTy = VTy.getVectorElementType(); if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) return SDValue(); if (VTy.getSizeInBits() < 64) return SDValue(); return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); } /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); unsigned AddrOpIdx = N->getNumOperands() - 1; SDValue Addr = N->getOperand(AddrOpIdx); // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = Addr.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User->getOpcode() != ISD::ADD || UI.getUse().getResNo() != Addr.getResNo()) continue; // Check that the add is independent of the load/store. Otherwise, folding // it would create a cycle. if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) continue; // Find the new opcode for the updating load/store. bool IsStore = false; bool IsLaneOp = false; bool IsDupOp = false; unsigned NewOpc = 0; unsigned NumVecs = 0; unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default: llvm_unreachable("unexpected intrinsic for Neon base update"); case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; NumVecs = 2; break; case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; NumVecs = 3; break; case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; NumVecs = 4; break; case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; NumVecs = 2; IsStore = true; break; case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; NumVecs = 3; IsStore = true; break; case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; NumVecs = 4; IsStore = true; break; case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; NumVecs = 2; break; case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; NumVecs = 3; break; case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; NumVecs = 4; break; case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; NumVecs = 2; IsStore = true; break; case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; NumVecs = 3; IsStore = true; break; case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; NumVecs = 4; IsStore = true; break; case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; NumVecs = 2; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; NumVecs = 3; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; NumVecs = 4; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; NumVecs = 2; IsLaneOp = true; break; case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; NumVecs = 3; IsLaneOp = true; break; case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; NumVecs = 4; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; NumVecs = 2; IsStore = true; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; NumVecs = 3; IsStore = true; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; NumVecs = 4; IsStore = true; IsLaneOp = true; break; } EVT VecTy; if (IsStore) VecTy = N->getOperand(2).getValueType(); else VecTy = N->getValueType(0); // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { uint32_t IncVal = CInc->getZExtValue(); unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; if (IsLaneOp || IsDupOp) NumBytes /= VecTy.getVectorNumElements(); if (IncVal != NumBytes) continue; Inc = DAG.getRegister(AArch64::XZR, MVT::i64); } SmallVector Ops; Ops.push_back(N->getOperand(0)); // Incoming chain // Load lane and store have vector list as input. if (IsLaneOp || IsStore) for (unsigned i = 2; i < AddrOpIdx; ++i) Ops.push_back(N->getOperand(i)); Ops.push_back(Addr); // Base register Ops.push_back(Inc); // Return Types. EVT Tys[6]; unsigned NumResultVecs = (IsStore ? 0 : NumVecs); unsigned n; for (n = 0; n < NumResultVecs; ++n) Tys[n] = VecTy; Tys[n++] = MVT::i64; // Type of write back register Tys[n] = MVT::Other; // Type of the chain SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); MemIntrinsicSDNode *MemInt = cast(N); SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, MemInt->getMemoryVT(), MemInt->getMemOperand()); // Update the uses. std::vector NewResults; for (unsigned i = 0; i < NumResultVecs; ++i) { NewResults.push_back(SDValue(UpdN.getNode(), i)); } NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); DCI.CombineTo(N, NewResults); DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); break; } return SDValue(); } // Checks to see if the value is the prescribed width and returns information // about its extension mode. static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { ExtType = ISD::NON_EXTLOAD; switch(V.getNode()->getOpcode()) { default: return false; case ISD::LOAD: { LoadSDNode *LoadNode = cast(V.getNode()); if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { ExtType = LoadNode->getExtensionType(); return true; } return false; } case ISD::AssertSext: { VTSDNode *TypeNode = cast(V.getNode()->getOperand(1)); if ((TypeNode->getVT() == MVT::i8 && width == 8) || (TypeNode->getVT() == MVT::i16 && width == 16)) { ExtType = ISD::SEXTLOAD; return true; } return false; } case ISD::AssertZext: { VTSDNode *TypeNode = cast(V.getNode()->getOperand(1)); if ((TypeNode->getVT() == MVT::i8 && width == 8) || (TypeNode->getVT() == MVT::i16 && width == 16)) { ExtType = ISD::ZEXTLOAD; return true; } return false; } case ISD::Constant: case ISD::TargetConstant: { if (std::abs(cast(V.getNode())->getSExtValue()) < 1LL << (width - 1)) return true; return false; } } return true; } // This function does a whole lot of voodoo to determine if the tests are // equivalent without and with a mask. Essentially what happens is that given a // DAG resembling: // // +-------------+ +-------------+ +-------------+ +-------------+ // | Input | | AddConstant | | CompConstant| | CC | // +-------------+ +-------------+ +-------------+ +-------------+ // | | | | // V V | +----------+ // +-------------+ +----+ | | // | ADD | |0xff| | | // +-------------+ +----+ | | // | | | | // V V | | // +-------------+ | | // | AND | | | // +-------------+ | | // | | | // +-----+ | | // | | | // V V V // +-------------+ // | CMP | // +-------------+ // // The AND node may be safely removed for some combinations of inputs. In // particular we need to take into account the extension type of the Input, // the exact values of AddConstant, CompConstant, and CC, along with the nominal // width of the input (this can work for any width inputs, the above graph is // specific to 8 bits. // // The specific equations were worked out by generating output tables for each // AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The // problem was simplified by working with 4 bit inputs, which means we only // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8 // patterns present in both extensions (0,7). For every distinct set of // AddConstant and CompConstants bit patterns we can consider the masked and // unmasked versions to be equivalent if the result of this function is true for // all 16 distinct bit patterns of for the current extension type of Input (w0). // // sub w8, w0, w1 // and w10, w8, #0x0f // cmp w8, w2 // cset w9, AArch64CC // cmp w10, w2 // cset w11, AArch64CC // cmp w9, w11 // cset w0, eq // ret // // Since the above function shows when the outputs are equivalent it defines // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and // would be expensive to run during compiles. The equations below were written // in a test harness that confirmed they gave equivalent outputs to the above // for all inputs function, so they can be used determine if the removal is // legal instead. // // isEquivalentMaskless() is the code for testing if the AND can be removed // factored out of the DAG recognition as the DAG can take several forms. static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, signed AddConstant, signed CompConstant) { // By being careful about our equations and only writing the in term // symbolic values and well known constants (0, 1, -1, MaxUInt) we can // make them generally applicable to all bit widths. signed MaxUInt = (1 << width); // For the purposes of these comparisons sign extending the type is // equivalent to zero extending the add and displacing it by half the integer // width. Provided we are careful and make sure our equations are valid over // the whole range we can just adjust the input and avoid writing equations // for sign extended inputs. if (ExtType == ISD::SEXTLOAD) AddConstant -= (1 << (width-1)); switch(CC) { case AArch64CC::LE: case AArch64CC::GT: { if ((AddConstant == 0) || (CompConstant == MaxUInt - 1 && AddConstant < 0) || (AddConstant >= 0 && CompConstant < 0) || (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) return true; } break; case AArch64CC::LT: case AArch64CC::GE: { if ((AddConstant == 0) || (AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) return true; } break; case AArch64CC::HI: case AArch64CC::LS: { if ((AddConstant >= 0 && CompConstant < 0) || (AddConstant <= 0 && CompConstant >= -1 && CompConstant < AddConstant + MaxUInt)) return true; } break; case AArch64CC::PL: case AArch64CC::MI: { if ((AddConstant == 0) || (AddConstant > 0 && CompConstant <= 0) || (AddConstant < 0 && CompConstant <= AddConstant)) return true; } break; case AArch64CC::LO: case AArch64CC::HS: { if ((AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant >= 0 && CompConstant <= AddConstant + MaxUInt)) return true; } break; case AArch64CC::EQ: case AArch64CC::NE: { if ((AddConstant > 0 && CompConstant < 0) || (AddConstant < 0 && CompConstant >= 0 && CompConstant < AddConstant + MaxUInt) || (AddConstant >= 0 && CompConstant >= 0 && CompConstant >= AddConstant) || (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) return true; } break; case AArch64CC::VS: case AArch64CC::VC: case AArch64CC::AL: case AArch64CC::NV: return true; case AArch64CC::Invalid: break; } return false; } static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex) { unsigned CC = cast(N->getOperand(CCIndex))->getSExtValue(); SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); unsigned CondOpcode = SubsNode->getOpcode(); if (CondOpcode != AArch64ISD::SUBS) return SDValue(); // There is a SUBS feeding this condition. Is it fed by a mask we can // use? SDNode *AndNode = SubsNode->getOperand(0).getNode(); unsigned MaskBits = 0; if (AndNode->getOpcode() != ISD::AND) return SDValue(); if (ConstantSDNode *CN = dyn_cast(AndNode->getOperand(1))) { uint32_t CNV = CN->getZExtValue(); if (CNV == 255) MaskBits = 8; else if (CNV == 65535) MaskBits = 16; } if (!MaskBits) return SDValue(); SDValue AddValue = AndNode->getOperand(0); if (AddValue.getOpcode() != ISD::ADD) return SDValue(); // The basic dag structure is correct, grab the inputs and validate them. SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); SDValue SubsInputValue = SubsNode->getOperand(1); // The mask is present and the provenance of all the values is a smaller type, // lets see if the mask is superfluous. if (!isa(AddInputValue2.getNode()) || !isa(SubsInputValue.getNode())) return SDValue(); ISD::LoadExtType ExtType; if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || !checkValueWidth(AddInputValue2, MaskBits, ExtType) || !checkValueWidth(AddInputValue1, MaskBits, ExtType) ) return SDValue(); if(!isEquivalentMaskless(CC, MaskBits, ExtType, cast(AddInputValue2.getNode())->getSExtValue(), cast(SubsInputValue.getNode())->getSExtValue())) return SDValue(); // The AND is not necessary, remove it. SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), SubsNode->getValueType(1)); SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); return SDValue(N, 0); } // Optimize compare with zero and branch. static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3); if (NV.getNode()) N = NV.getNode(); SDValue Chain = N->getOperand(0); SDValue Dest = N->getOperand(1); SDValue CCVal = N->getOperand(2); SDValue Cmp = N->getOperand(3); assert(isa(CCVal) && "Expected a ConstantSDNode here!"); unsigned CC = cast(CCVal)->getZExtValue(); if (CC != AArch64CC::EQ && CC != AArch64CC::NE) return SDValue(); unsigned CmpOpc = Cmp.getOpcode(); if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) return SDValue(); // Only attempt folding if there is only one use of the flag and no use of the // value. if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) return SDValue(); SDValue LHS = Cmp.getOperand(0); SDValue RHS = Cmp.getOperand(1); assert(LHS.getValueType() == RHS.getValueType() && "Expected the value type to be the same for both operands!"); if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return SDValue(); if (isNullConstant(LHS)) std::swap(LHS, RHS); if (!isNullConstant(RHS)) return SDValue(); if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || LHS.getOpcode() == ISD::SRL) return SDValue(); // Fold the compare into the branch instruction. SDValue BR; if (CC == AArch64CC::EQ) BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); else BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, BR, false); return SDValue(); } // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test // as well as whether the test should be inverted. This code is required to // catch these cases (as opposed to standard dag combines) because // AArch64ISD::TBZ is matched during legalization. static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG) { if (!Op->hasOneUse()) return Op; // We don't handle undef/constant-fold cases below, as they should have // already been taken care of (e.g. and of 0, test of undefined shifted bits, // etc.) // (tbz (trunc x), b) -> (tbz x, b) // This case is just here to enable more of the below cases to be caught. if (Op->getOpcode() == ISD::TRUNCATE && Bit < Op->getValueType(0).getSizeInBits()) { return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } if (Op->getNumOperands() != 2) return Op; auto *C = dyn_cast(Op->getOperand(1)); if (!C) return Op; switch (Op->getOpcode()) { default: return Op; // (tbz (and x, m), b) -> (tbz x, b) case ISD::AND: if ((C->getZExtValue() >> Bit) & 1) return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); return Op; // (tbz (shl x, c), b) -> (tbz x, b-c) case ISD::SHL: if (C->getZExtValue() <= Bit && (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { Bit = Bit - C->getZExtValue(); return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } return Op; // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x case ISD::SRA: Bit = Bit + C->getZExtValue(); if (Bit >= Op->getValueType(0).getSizeInBits()) Bit = Op->getValueType(0).getSizeInBits() - 1; return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); // (tbz (srl x, c), b) -> (tbz x, b+c) case ISD::SRL: if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { Bit = Bit + C->getZExtValue(); return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } return Op; // (tbz (xor x, -1), b) -> (tbnz x, b) case ISD::XOR: if ((C->getZExtValue() >> Bit) & 1) Invert = !Invert; return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } } // Optimize test single bit zero/non-zero and branch. static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { unsigned Bit = cast(N->getOperand(2))->getZExtValue(); bool Invert = false; SDValue TestSrc = N->getOperand(1); SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); if (TestSrc == NewTestSrc) return SDValue(); unsigned NewOpc = N->getOpcode(); if (Invert) { if (NewOpc == AArch64ISD::TBZ) NewOpc = AArch64ISD::TBNZ; else { assert(NewOpc == AArch64ISD::TBNZ); NewOpc = AArch64ISD::TBZ; } } SDLoc DL(N); return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc, DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); } // vselect (v1i1 setcc) -> // vselect (v1iXX setcc) (XX is the size of the compared operand type) // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine // such VSELECT. static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); EVT CCVT = N0.getValueType(); if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || CCVT.getVectorElementType() != MVT::i1) return SDValue(); EVT ResVT = N->getValueType(0); EVT CmpVT = N0.getOperand(0).getValueType(); // Only combine when the result type is of the same size as the compared // operands. if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) return SDValue(); SDValue IfTrue = N->getOperand(1); SDValue IfFalse = N->getOperand(2); SDValue SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, IfTrue, IfFalse); } /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with /// the compare-mask instructions rather than going via NZCV, even if LHS and /// RHS are really scalar. This replaces any scalar setcc in the above pattern /// with a vector one followed by a DUP shuffle on the result. static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; SDValue N0 = N->getOperand(0); EVT ResVT = N->getValueType(0); if (N0.getOpcode() != ISD::SETCC) return SDValue(); // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered // scalar SetCCResultType. We also don't expect vectors, because we assume // that selects fed by vector SETCCs are canonicalized to VSELECT. assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && "Scalar-SETCC feeding SELECT has unexpected result type!"); // If NumMaskElts == 0, the comparison is larger than select result. The // largest real NEON comparison is 64-bits per lane, which means the result is // at most 32-bits and an illegal vector. Just bail out for now. EVT SrcVT = N0.getOperand(0).getValueType(); // Don't try to do this optimization when the setcc itself has i1 operands. // There are no legal vectors of i1, so this would be pointless. if (SrcVT == MVT::i1) return SDValue(); int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); if (!ResVT.isVector() || NumMaskElts == 0) return SDValue(); SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); // Also bail out if the vector CCVT isn't the same size as ResVT. // This can happen if the SETCC operand size doesn't divide the ResVT size // (e.g., f64 vs v3f32). if (CCVT.getSizeInBits() != ResVT.getSizeInBits()) return SDValue(); // Make sure we didn't create illegal types, if we're not supposed to. assert(DCI.isBeforeLegalize() || DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)); // First perform a vector comparison, where lane 0 is the one we're interested // in. SDLoc DL(N0); SDValue LHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); SDValue RHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); // Now duplicate the comparison mask we want across all other lanes. SmallVector DUPMask(CCVT.getVectorNumElements(), 0); SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), Mask); return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); } /// Get rid of unnecessary NVCASTs (that don't change the type). static SDValue performNVCASTCombine(SDNode *N) { if (N->getValueType(0) == N->getOperand(0).getValueType()) return N->getOperand(0); return SDValue(); } SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: break; case ISD::ADD: case ISD::SUB: return performAddSubLongCombine(N, DCI, DAG); case ISD::XOR: return performXorCombine(N, DAG, DCI, Subtarget); case ISD::MUL: return performMulCombine(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return performIntToFpCombine(N, DAG, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return performFpToIntCombine(N, DAG, Subtarget); case ISD::FDIV: return performFDivCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return performIntrinsicCombine(N, DCI, Subtarget); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: return performExtendCombine(N, DCI, DAG); case ISD::BITCAST: return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); case ISD::SELECT: { SDValue RV = performSelectCombine(N, DCI); if (!RV.getNode()) RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); return RV; } case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); case ISD::LOAD: if (performTBISimplification(N->getOperand(1), DCI, DAG)) return SDValue(N, 0); break; case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); case AArch64ISD::TBNZ: case AArch64ISD::TBZ: return performTBZCombine(N, DCI, DAG); case AArch64ISD::CSEL: return performCONDCombine(N, DCI, DAG, 2, 3); case AArch64ISD::DUP: return performPostLD1Combine(N, DCI, false); case AArch64ISD::NVCAST: return performNVCASTCombine(N); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); case ISD::EXTRACT_VECTOR_ELT: return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: case Intrinsic::aarch64_neon_ld1x2: case Intrinsic::aarch64_neon_ld1x3: case Intrinsic::aarch64_neon_ld1x4: case Intrinsic::aarch64_neon_ld2lane: case Intrinsic::aarch64_neon_ld3lane: case Intrinsic::aarch64_neon_ld4lane: case Intrinsic::aarch64_neon_ld2r: case Intrinsic::aarch64_neon_ld3r: case Intrinsic::aarch64_neon_ld4r: case Intrinsic::aarch64_neon_st2: case Intrinsic::aarch64_neon_st3: case Intrinsic::aarch64_neon_st4: case Intrinsic::aarch64_neon_st1x2: case Intrinsic::aarch64_neon_st1x3: case Intrinsic::aarch64_neon_st1x4: case Intrinsic::aarch64_neon_st2lane: case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: return performNEONPostLDSTCombine(N, DCI, DAG); default: break; } } return SDValue(); } // Check if the return value is used as only a return value, as otherwise // we can't perform a tail-call. In particular, we need to check for // target ISD nodes that are returns and any other "odd" constructs // that the generic analysis code won't necessarily catch. bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; for (SDNode *Node : Copy->uses()) { if (Node->getOpcode() != AArch64ISD::RET_FLAG) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } // Return whether the an instruction can potentially be optimized to a tail // call. This will cause the optimizers to attempt to move, or duplicate, // return instructions to help enable tail call optimizations for this // instruction. bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { if (!CI->isTailCall()) return false; return true; } bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, bool &IsInc, SelectionDAG &DAG) const { if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) return false; Base = Op->getOperand(0); // All of the indexed addressing mode instructions take a signed // 9 bit immediate offset. if (ConstantSDNode *RHS = dyn_cast(Op->getOperand(1))) { int64_t RHSC = (int64_t)RHS->getZExtValue(); if (RHSC >= 256 || RHSC <= -256) return false; IsInc = (Op->getOpcode() == ISD::ADD); Offset = Op->getOperand(1); return true; } return false; } bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); } else return false; bool IsInc; if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) return false; AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; return true; } bool AArch64TargetLowering::getPostIndexedAddressParts( SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); } else return false; bool IsInc; if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) return false; // Post-indexing updates the base, so it's not a valid transform // if that's not the same as the load's pointer. if (Ptr != Base) return false; AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; return true; } static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) { SDLoc DL(N); SDValue Op = N->getOperand(0); if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) return; Op = SDValue( DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, DAG.getUNDEF(MVT::i32), Op, DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 0); Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); } static void ReplaceReductionResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp) { EVT LoVT, HiVT; SDValue Lo, Hi; SDLoc dl(N); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); Results.push_back(SplitVal); } void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom expand this"); case ISD::BITCAST: ReplaceBITCASTResults(N, Results, DAG); return; case AArch64ISD::SADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); return; case AArch64ISD::UADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); return; case AArch64ISD::SMINV: ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); return; case AArch64ISD::UMINV: ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); return; case AArch64ISD::SMAXV: ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); return; case AArch64ISD::UMAXV: ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); return; case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); // Let normal code take care of it by not adding anything to Results. return; } } bool AArch64TargetLowering::useLoadStackGuardNode() const { return true; } unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal if there are three or more FDIVs. return 3; } TargetLoweringBase::LegalizeTypeAction AArch64TargetLowering::getPreferredVectorAction(EVT VT) const { MVT SVT = VT.getSimpleVT(); // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, // v4i16, v2i32 instead of to promote. if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32 || SVT == MVT::v1f32) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); return Size == 128; } // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } // For the real atomic operations, we have ldxr/stxr up to 128 bits, TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { return true; } Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = cast(Addr->getType())->getElementType(); bool IsAcquire = isAtLeastAcquire(Ord); // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i64, i64} and we have to recombine them into a // single i128 here. if (ValTy->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); return Builder.CreateOr( Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); } Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateTruncOrBitCast( Builder.CreateCall(Ldxr, Addr), cast(Addr->getType())->getElementType()); } void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilder<> &Builder) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Builder.CreateCall( llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); bool IsRelease = isAtLeastRelease(Ord); // Since the intrinsics must have legal type, the i128 intrinsics take two // parameters: "i64, i64". We must marshal Val into the appropriate form // before the call. if (Val->getType()->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; Function *Stxr = Intrinsic::getDeclaration(M, Int); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); } Intrinsic::ID Int = IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; Type *Tys[] = { Addr->getType() }; Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateCall(Stxr, {Builder.CreateZExtOrBitCast( Val, Stxr->getFunctionType()->getParamType(0)), Addr}); } bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { return Ty->isArrayTy(); } bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, EVT) const { return false; } Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (!Subtarget->isTargetAndroid()) return TargetLowering::getSafeStackPointerLocation(IRB); // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h const unsigned TlsOffset = 0x48; Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer); return IRB.CreatePointerCast( IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); } void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { // Update IsSplitCSR in AArch64unctionInfo. AArch64FunctionInfo *AFI = Entry->getParent()->getInfo(); AFI->setIsSplitCSR(true); } void AArch64TargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (AArch64::GPR64RegClass.contains(*I)) RC = &AArch64::GPR64RegClass; else if (AArch64::FPR64RegClass.contains(*I)) RC = &AArch64::FPR64RegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); unsigned NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert(Entry->getParent()->getFunction()->hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); - BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - NewVR) + BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); + // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) - BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - *I) + BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } Index: projects/clang380-import/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp =================================================================== --- projects/clang380-import/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp (revision 294609) @@ -1,205 +1,217 @@ //===- lib/MC/AArch64ELFStreamer.cpp - ELF Object Output for AArch64 ------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file assembles .s files and emits AArch64 ELF .o object files. Different // from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit // regions of data and code. // //===----------------------------------------------------------------------===// #include "AArch64TargetStreamer.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; namespace { class AArch64ELFStreamer; class AArch64TargetAsmStreamer : public AArch64TargetStreamer { formatted_raw_ostream &OS; void emitInst(uint32_t Inst) override; public: AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); }; AArch64TargetAsmStreamer::AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) : AArch64TargetStreamer(S), OS(OS) {} void AArch64TargetAsmStreamer::emitInst(uint32_t Inst) { OS << "\t.inst\t0x" << Twine::utohexstr(Inst) << "\n"; } class AArch64TargetELFStreamer : public AArch64TargetStreamer { private: AArch64ELFStreamer &getStreamer(); void emitInst(uint32_t Inst) override; public: AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {} }; /// Extend the generic ELFStreamer class so that it can emit mapping symbols at /// the appropriate points in the object files. These symbols are defined in the /// AArch64 ELF ABI: /// infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf /// /// In brief: $x or $d should be emitted at the start of each contiguous region /// of A64 code or data in a section. In practice, this emission does not rely /// on explicit assembler directives but on inherent properties of the /// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an /// instruction). /// /// As a result this system is orthogonal to the DataRegion infrastructure used /// by MachO. Beware! class AArch64ELFStreamer : public MCELFStreamer { public: friend class AArch64TargetELFStreamer; AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_pwrite_stream &OS, MCCodeEmitter *Emitter) : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0), LastEMS(EMS_None) {} void ChangeSection(MCSection *Section, const MCExpr *Subsection) override { // We have to keep track of the mapping symbol state of any sections we // use. Each one should start off as EMS_None, which is provided as the // default constructor by DenseMap::lookup. LastMappingSymbols[getPreviousSection().first] = LastEMS; LastEMS = LastMappingSymbols.lookup(Section); MCELFStreamer::ChangeSection(Section, Subsection); } /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to add the appropriate mapping symbol if /// necessary. void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override { EmitA64MappingSymbol(); MCELFStreamer::EmitInstruction(Inst, STI); } + /// Emit a 32-bit value as an instruction. This is only used for the .inst + /// directive, EmitInstruction should be used in other cases. void emitInst(uint32_t Inst) { + char Buffer[4]; + + // We can't just use EmitIntValue here, as that will emit a data mapping + // symbol, and swap the endianness on big-endian systems (instructions are + // always little-endian). + for (unsigned I = 0; I < 4; ++I) { + Buffer[I] = uint8_t(Inst); + Inst >>= 8; + } + EmitA64MappingSymbol(); - MCELFStreamer::EmitIntValue(Inst, 4); + MCELFStreamer::EmitBytes(StringRef(Buffer, 4)); } /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. void EmitBytes(StringRef Data) override { EmitDataMappingSymbol(); MCELFStreamer::EmitBytes(Data); } /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { EmitDataMappingSymbol(); MCELFStreamer::EmitValueImpl(Value, Size, Loc); } private: enum ElfMappingSymbol { EMS_None, EMS_A64, EMS_Data }; void EmitDataMappingSymbol() { if (LastEMS == EMS_Data) return; EmitMappingSymbol("$d"); LastEMS = EMS_Data; } void EmitA64MappingSymbol() { if (LastEMS == EMS_A64) return; EmitMappingSymbol("$x"); LastEMS = EMS_A64; } void EmitMappingSymbol(StringRef Name) { auto *Symbol = cast(getContext().getOrCreateSymbol( Name + "." + Twine(MappingSymbolCounter++))); EmitLabel(Symbol); Symbol->setType(ELF::STT_NOTYPE); Symbol->setBinding(ELF::STB_LOCAL); Symbol->setExternal(false); } int64_t MappingSymbolCounter; DenseMap LastMappingSymbols; ElfMappingSymbol LastEMS; }; } // end anonymous namespace AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() { return static_cast(Streamer); } void AArch64TargetELFStreamer::emitInst(uint32_t Inst) { getStreamer().emitInst(Inst); } namespace llvm { MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS, MCInstPrinter *InstPrint, bool isVerboseAsm) { return new AArch64TargetAsmStreamer(S, OS); } MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_pwrite_stream &OS, MCCodeEmitter *Emitter, bool RelaxAll) { AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter); if (RelaxAll) S->getAssembler().setRelaxAll(true); return S; } MCTargetStreamer * createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { const Triple &TT = STI.getTargetTriple(); if (TT.isOSBinFormatELF()) return new AArch64TargetELFStreamer(S); return nullptr; } } Index: projects/clang380-import/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- projects/clang380-import/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp (revision 294609) @@ -1,12454 +1,12455 @@ //===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the interfaces that ARM uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "ARMISelLowering.h" #include "ARMCallingConv.h" #include "ARMConstantPoolValue.h" #include "ARMMachineFunctionInfo.h" #include "ARMPerfectShuffle.h" #include "ARMSubtarget.h" #include "ARMTargetMachine.h" #include "ARMTargetObjectFile.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Type.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" #include using namespace llvm; #define DEBUG_TYPE "arm-isel" STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); static cl::opt ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true)); namespace { class ARMCCState : public CCState { public: ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF, SmallVectorImpl &locs, LLVMContext &C, ParmContext PC) : CCState(CC, isVarArg, MF, locs, C) { assert(((PC == Call) || (PC == Prologue)) && "ARMCCState users must specify whether their context is call" "or prologue generation."); CallOrPrologue = PC; } }; } // The APCS parameter registers. static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT) { if (VT != PromotedLdStVT) { setOperationAction(ISD::LOAD, VT, Promote); AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT); setOperationAction(ISD::STORE, VT, Promote); AddPromotedToType (ISD::STORE, VT, PromotedLdStVT); } MVT ElemTy = VT.getVectorElementType(); if (ElemTy != MVT::i64 && ElemTy != MVT::f64) setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); if (ElemTy == MVT::i32) { setOperationAction(ISD::SINT_TO_FP, VT, Custom); setOperationAction(ISD::UINT_TO_FP, VT, Custom); setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); } else { setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); } setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT.isInteger()) { setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::SRL, VT, Custom); } // Promote all bit-wise operations. if (VT.isInteger() && VT != PromotedBitwiseVT) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT); setOperationAction(ISD::OR, VT, Promote); AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT); setOperationAction(ISD::XOR, VT, Promote); AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT); } // Neon does not support vector divide/remainder operations. setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT, Legal); } void ARMTargetLowering::addDRTypeForNEON(MVT VT) { addRegisterClass(VT, &ARM::DPRRegClass); addTypeForNEON(VT, MVT::f64, MVT::v2i32); } void ARMTargetLowering::addQRTypeForNEON(MVT VT) { addRegisterClass(VT, &ARM::DPairRegClass); addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); } ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { RegInfo = Subtarget->getRegisterInfo(); Itins = Subtarget->getInstrItineraryData(); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); if (Subtarget->isTargetMachO()) { // Uses VFP for Thumb libfuncs if available. if (Subtarget->isThumb() && Subtarget->hasVFP2() && Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { static const struct { const RTLIB::Libcall Op; const char * const Name; const ISD::CondCode Cond; } LibraryCalls[] = { // Single-precision floating-point arithmetic. { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, // Double-precision floating-point arithmetic. { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, // Single-precision comparisons. { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ }, // Double-precision comparisons. { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ }, // Floating-point to integer conversions. // i64 conversions are done via library routines even when generating VFP // instructions, so use the same ones. { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, // Conversions between floating types. { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, // Integer to floating-point conversions. // i64 conversions are done via library routines even when generating VFP // instructions, so use the same ones. // FIXME: There appears to be some naming inconsistency in ARM libgcc: // e.g., __floatunsidf vs. __floatunssidfvfp. { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); if (LC.Cond != ISD::SETCC_INVALID) setCmpLibcallCC(LC.Op, LC.Cond); } } // Set the correct calling convention for ARMv7k WatchOS. It's just // AAPCS_VFP for functions as simple as libcalls. if (Subtarget->isTargetWatchOS()) { for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP); } } // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); // RTLIB if (Subtarget->isAAPCS_ABI() && (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || Subtarget->isTargetAndroid())) { static const struct { const RTLIB::Libcall Op; const char * const Name; const CallingConv::ID CC; const ISD::CondCode Cond; } LibraryCalls[] = { // Double-precision floating-point arithmetic helper functions // RTABI chapter 4.1.2, Table 2 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Double-precision floating-point comparison helper functions // RTABI chapter 4.1.2, Table 3 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, // Single-precision floating-point arithmetic helper functions // RTABI chapter 4.1.2, Table 4 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Single-precision floating-point comparison helper functions // RTABI chapter 4.1.2, Table 5 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, // Floating-point to integer conversions. // RTABI chapter 4.1.2, Table 6 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Conversions between floating types. // RTABI chapter 4.1.2, Table 7 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Integer to floating-point conversions. // RTABI chapter 4.1.2, Table 8 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Long long helper functions // RTABI chapter 4.2, Table 9 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Integer division functions // RTABI chapter 4.3.1 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); if (LC.Cond != ISD::SETCC_INVALID) setCmpLibcallCC(LC.Op, LC.Cond); } // EABI dependent RTLIB if (TM.Options.EABIVersion == EABI::EABI4 || TM.Options.EABIVersion == EABI::EABI5) { static const struct { const RTLIB::Libcall Op; const char *const Name; const CallingConv::ID CC; const ISD::CondCode Cond; } MemOpsLibraryCalls[] = { // Memory operations // RTABI chapter 4.3.4 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, }; for (const auto &LC : MemOpsLibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); if (LC.Cond != ISD::SETCC_INVALID) setCmpLibcallCC(LC.Op, LC.Cond); } } } if (Subtarget->isTargetWindows()) { static const struct { const RTLIB::Libcall Op; const char * const Name; const CallingConv::ID CC; } LibraryCalls[] = { { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP }, { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, { RTLIB::SDIV_I32, "__rt_sdiv", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UDIV_I32, "__rt_udiv", CallingConv::ARM_AAPCS_VFP }, { RTLIB::SDIV_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UDIV_I64, "__rt_udiv64", CallingConv::ARM_AAPCS_VFP }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); } } // Use divmod compiler-rt calls for iOS 5.0 and later. if (Subtarget->isTargetWatchOS() || (Subtarget->isTargetIOS() && !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); } // The half <-> float conversion functions are always soft-float, but are // needed for some targets which use a hard-float calling convention by // default. if (Subtarget->isAAPCS_ABI()) { setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS); } else { setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS); setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS); setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); } // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have // a __gnu_ prefix (which is the default). if (Subtarget->isTargetAEABI()) { setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h"); setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h"); setLibcallName(RTLIB::FPEXT_F16_F32, "__aeabi_h2f"); } if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else addRegisterClass(MVT::i32, &ARM::GPRRegClass); if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { addRegisterClass(MVT::f32, &ARM::SPRRegClass); addRegisterClass(MVT::f64, &ARM::DPRRegClass); } for (MVT VT : MVT::vector_valuetypes()) { for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); } setOperationAction(ISD::ConstantFP, MVT::f32, Custom); setOperationAction(ISD::ConstantFP, MVT::f64, Custom); setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); if (Subtarget->hasNEON()) { addDRTypeForNEON(MVT::v2f32); addDRTypeForNEON(MVT::v8i8); addDRTypeForNEON(MVT::v4i16); addDRTypeForNEON(MVT::v2i32); addDRTypeForNEON(MVT::v1i64); addQRTypeForNEON(MVT::v4f32); addQRTypeForNEON(MVT::v2f64); addQRTypeForNEON(MVT::v16i8); addQRTypeForNEON(MVT::v8i16); addQRTypeForNEON(MVT::v4i32); addQRTypeForNEON(MVT::v2i64); // v2f64 is legal so that QR subregs can be extracted as f64 elements, but // neither Neon nor VFP support any arithmetic operations on it. // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively // supported for v4f32. setOperationAction(ISD::FADD, MVT::v2f64, Expand); setOperationAction(ISD::FSUB, MVT::v2f64, Expand); setOperationAction(ISD::FMUL, MVT::v2f64, Expand); // FIXME: Code duplication: FDIV and FREM are expanded always, see // ARMTargetLowering::addTypeForNEON method for details. setOperationAction(ISD::FDIV, MVT::v2f64, Expand); setOperationAction(ISD::FREM, MVT::v2f64, Expand); // FIXME: Create unittest. // In another words, find a way when "copysign" appears in DAG with vector // operands. setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); // FIXME: Code duplication: SETCC has custom operation action, see // ARMTargetLowering::addTypeForNEON method for details. setOperationAction(ISD::SETCC, MVT::v2f64, Expand); // FIXME: Create unittest for FNEG and for FABS. setOperationAction(ISD::FNEG, MVT::v2f64, Expand); setOperationAction(ISD::FABS, MVT::v2f64, Expand); setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); setOperationAction(ISD::FSIN, MVT::v2f64, Expand); setOperationAction(ISD::FCOS, MVT::v2f64, Expand); setOperationAction(ISD::FPOWI, MVT::v2f64, Expand); setOperationAction(ISD::FPOW, MVT::v2f64, Expand); setOperationAction(ISD::FLOG, MVT::v2f64, Expand); setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); setOperationAction(ISD::FEXP, MVT::v2f64, Expand); setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR. setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); setOperationAction(ISD::FRINT, MVT::v2f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); setOperationAction(ISD::FMA, MVT::v2f64, Expand); setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); setOperationAction(ISD::FSIN, MVT::v4f32, Expand); setOperationAction(ISD::FCOS, MVT::v4f32, Expand); setOperationAction(ISD::FPOWI, MVT::v4f32, Expand); setOperationAction(ISD::FPOW, MVT::v4f32, Expand); setOperationAction(ISD::FLOG, MVT::v4f32, Expand); setOperationAction(ISD::FLOG2, MVT::v4f32, Expand); setOperationAction(ISD::FLOG10, MVT::v4f32, Expand); setOperationAction(ISD::FEXP, MVT::v4f32, Expand); setOperationAction(ISD::FEXP2, MVT::v4f32, Expand); setOperationAction(ISD::FCEIL, MVT::v4f32, Expand); setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand); setOperationAction(ISD::FRINT, MVT::v4f32, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand); // Mark v2f32 intrinsics. setOperationAction(ISD::FSQRT, MVT::v2f32, Expand); setOperationAction(ISD::FSIN, MVT::v2f32, Expand); setOperationAction(ISD::FCOS, MVT::v2f32, Expand); setOperationAction(ISD::FPOWI, MVT::v2f32, Expand); setOperationAction(ISD::FPOW, MVT::v2f32, Expand); setOperationAction(ISD::FLOG, MVT::v2f32, Expand); setOperationAction(ISD::FLOG2, MVT::v2f32, Expand); setOperationAction(ISD::FLOG10, MVT::v2f32, Expand); setOperationAction(ISD::FEXP, MVT::v2f32, Expand); setOperationAction(ISD::FEXP2, MVT::v2f32, Expand); setOperationAction(ISD::FCEIL, MVT::v2f32, Expand); setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand); setOperationAction(ISD::FRINT, MVT::v2f32, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand); setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand); // Neon does not support some operations on v1i64 and v2i64 types. setOperationAction(ISD::MUL, MVT::v1i64, Expand); // Custom handling for some quad-vector types to detect VMULL. setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); // Custom handling for some vector types to avoid expensive expansions setOperationAction(ISD::SDIV, MVT::v4i16, Custom); setOperationAction(ISD::SDIV, MVT::v8i8, Custom); setOperationAction(ISD::UDIV, MVT::v4i16, Custom); setOperationAction(ISD::UDIV, MVT::v8i8, Custom); setOperationAction(ISD::SETCC, MVT::v1i64, Expand); setOperationAction(ISD::SETCC, MVT::v2i64, Expand); // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with // a destination type that is wider than the source, and nor does // it have a FP_TO_[SU]INT instruction with a narrower destination than // source. setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); // NEON does not have single instruction CTPOP for vectors with element // types wider than 8-bits. However, custom lowering can leverage the // v8i8/v16i8 vcnt instruction. setOperationAction(ISD::CTPOP, MVT::v2i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); // NEON does not have single instruction CTTZ for vectors. setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); // NEON only has FMA instructions as of VFP4. if (!Subtarget->hasVFP4()) { setOperationAction(ISD::FMA, MVT::v2f32, Expand); setOperationAction(ISD::FMA, MVT::v4f32, Expand); } setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); setTargetDAGCombine(ISD::FDIV); setTargetDAGCombine(ISD::LOAD); // It is legal to extload from v4i8 to v4i16 or v4i32. for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, MVT::v2i32}) { for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); } } } // ARM and Thumb2 support UMLAL/SMLAL. if (!Subtarget->isThumb1Only()) setTargetDAGCombine(ISD::ADDC); if (Subtarget->isFPOnlySP()) { // When targeting a floating-point unit with only single-precision // operations, f64 is legal for the few double-precision instructions which // are present However, no double-precision operations other than moves, // loads and stores are provided by the hardware. setOperationAction(ISD::FADD, MVT::f64, Expand); setOperationAction(ISD::FSUB, MVT::f64, Expand); setOperationAction(ISD::FMUL, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FDIV, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FGETSIGN, MVT::f64, Expand); setOperationAction(ISD::FNEG, MVT::f64, Expand); setOperationAction(ISD::FABS, MVT::f64, Expand); setOperationAction(ISD::FSQRT, MVT::f64, Expand); setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FPOWI, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FLOG, MVT::f64, Expand); setOperationAction(ISD::FLOG2, MVT::f64, Expand); setOperationAction(ISD::FLOG10, MVT::f64, Expand); setOperationAction(ISD::FEXP, MVT::f64, Expand); setOperationAction(ISD::FEXP2, MVT::f64, Expand); setOperationAction(ISD::FCEIL, MVT::f64, Expand); setOperationAction(ISD::FTRUNC, MVT::f64, Expand); setOperationAction(ISD::FRINT, MVT::f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); setOperationAction(ISD::FFLOOR, MVT::f64, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); } computeRegisterProperties(Subtarget->getRegisterInfo()); // ARM does not have floating-point extending loads. for (MVT VT : MVT::fp_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); } // ... or truncating stores setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); // ARM does not have i1 sign extending load. for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // ARM supports all 4 flavors of integer indexed load / store. if (!Subtarget->isThumb1Only()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, MVT::i1, Legal); setIndexedLoadAction(im, MVT::i8, Legal); setIndexedLoadAction(im, MVT::i16, Legal); setIndexedLoadAction(im, MVT::i32, Legal); setIndexedStoreAction(im, MVT::i1, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); } } setOperationAction(ISD::SADDO, MVT::i32, Custom); setOperationAction(ISD::UADDO, MVT::i32, Custom); setOperationAction(ISD::SSUBO, MVT::i32, Custom); setOperationAction(ISD::USUBO, MVT::i32, Custom); // i64 operation support. setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::MULHU, MVT::i32, Expand); if (Subtarget->isThumb1Only()) { setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); } if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() || (Subtarget->isThumb2() && !Subtarget->hasDSP())) setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL, MVT::i64, Custom); setOperationAction(ISD::SRA, MVT::i64, Custom); if (!Subtarget->isThumb1Only()) { // FIXME: We should do this for Thumb1 as well. setOperationAction(ISD::ADDC, MVT::i32, Custom); setOperationAction(ISD::ADDE, MVT::i32, Custom); setOperationAction(ISD::SUBC, MVT::i32, Custom); setOperationAction(ISD::SUBE, MVT::i32, Custom); } if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); // ARM does not have ROTL. setOperationAction(ISD::ROTL, MVT::i32, Expand); for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } setOperationAction(ISD::CTTZ, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i32, Expand); if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) setOperationAction(ISD::CTLZ, MVT::i32, Expand); // These just redirect to CTTZ and CTLZ on ARM. setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand); // @llvm.readcyclecounter requires the Performance Monitors extension. // Default to the 0 expansion on unsupported platforms. // FIXME: Technically there are older ARM CPUs that have // implementation-specific ways of obtaining this information. if (Subtarget->hasPerfMon()) setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); // Only ARMv6 has BSWAP. if (!Subtarget->hasV6Ops()) setOperationAction(ISD::BSWAP, MVT::i32, Expand); if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) && !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) { // These are expanded into libcalls if the cpu doesn't have HW divider. setOperationAction(ISD::SDIV, MVT::i32, LibCall); setOperationAction(ISD::UDIV, MVT::i32, LibCall); } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); // Register based DivRem for AEABI (RTABI 4.2) if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) { setOperationAction(ISD::SREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i64, Custom); setLibcallName(RTLIB::SDIVREM_I8, "__aeabi_idivmod"); setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod"); setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod"); setLibcallName(RTLIB::SDIVREM_I64, "__aeabi_ldivmod"); setLibcallName(RTLIB::UDIVREM_I8, "__aeabi_uidivmod"); setLibcallName(RTLIB::UDIVREM_I16, "__aeabi_uidivmod"); setLibcallName(RTLIB::UDIVREM_I32, "__aeabi_uidivmod"); setLibcallName(RTLIB::UDIVREM_I64, "__aeabi_uldivmod"); setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS); setOperationAction(ISD::SDIVREM, MVT::i32, Custom); setOperationAction(ISD::UDIVREM, MVT::i32, Custom); } else { setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); } setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); // Use the default implementation. setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::Other, Expand); setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::VAEND, MVT::Other, Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); else setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use // the default expansion. If we are targeting a single threaded system, // then set them all for expand so we can lower them later into their // non-atomic form. if (TM.Options.ThreadModel == ThreadModel::Single) setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); else if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) { // ATOMIC_FENCE needs custom lowering; the others should have been expanded // to ldrex/strex loops already. setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); // On v8, we have particularly efficient implementations of atomic fences // if they can be combined with nearby atomic loads and stores. if (!Subtarget->hasV8Ops()) { // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. setInsertFencesForAtomic(true); } } else { // If there's anything we can use as a barrier, go through custom lowering // for ATOMIC_FENCE. setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Subtarget->hasAnyDataBarrier() ? Custom : Expand); // Set them all for expansion, which will force libcalls. setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the // Unordered/Monotonic case. setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); } setOperationAction(ISD::PREFETCH, MVT::Other, Custom); // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. if (!Subtarget->hasV6Ops()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); } setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR // iff target supports vfp2. setOperationAction(ISD::BITCAST, MVT::i64, Custom); setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); if (Subtarget->useSjLjEH()) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); setOperationAction(ISD::SETCC, MVT::f64, Expand); setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::f32, Custom); setOperationAction(ISD::BR_CC, MVT::f64, Custom); setOperationAction(ISD::BR_JT, MVT::Other, Custom); // We don't support sin/cos/fmod/copysign/pow setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f32, Expand); if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); } setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); if (!Subtarget->hasVFP4()) { setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); } // Various VFP goodness if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) { setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); } // fp16 is a special v7 extension that adds f16 <-> f32 conversions. if (!Subtarget->hasFP16()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } } // Combine sin / cos into one node or libcall if possible. if (Subtarget->hasSinCos()) { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); if (Subtarget->isTargetWatchOS()) { setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP); setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP); } if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) { // For iOS, we don't want to the normal expansion of a libcall to // sincos. We want to issue a libcall to __sincos_stret. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } } // FP-ARMv8 implements a lot of rounding-like FP operations. if (Subtarget->hasFPARMv8()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FROUND, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); setOperationAction(ISD::FMINNUM, MVT::f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); if (!Subtarget->isFPOnlySP()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FROUND, MVT::f64, Legal); setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); setOperationAction(ISD::FMINNUM, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); } } if (Subtarget->hasNEON()) { // vmin and vmax aren't available in a scalar form, so we use // a NEON instruction with an undef lane instead. setOperationAction(ISD::FMINNAN, MVT::f32, Legal); setOperationAction(ISD::FMAXNAN, MVT::f32, Legal); setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal); setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal); setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal); setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal); } // We have target-specific dag combine patterns for the following nodes: // ARMISD::VMOVRRD - No need to call setTargetDAGCombine setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::XOR); if (Subtarget->hasV6Ops()) setTargetDAGCombine(ISD::SRL); setStackPointerRegisterToSaveRestore(ARM::SP); if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || !Subtarget->hasVFP2()) setSchedulingPreference(Sched::RegPressure); else setSchedulingPreference(Sched::Hybrid); //// temporary - rewrite interface to use type MaxStoresPerMemset = 8; MaxStoresPerMemsetOptSize = 4; MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores MaxStoresPerMemcpyOptSize = 2; MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 2; // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. setMinStackArgumentAlignment(4); // Prefer likely predicted branches to selects on out-of-order cores. PredictableSelectIsExpensive = Subtarget->isLikeA9(); setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); } bool ARMTargetLowering::useSoftFloat() const { return Subtarget->useSoftFloat(); } // FIXME: It might make sense to define the representative register class as the // nearest super-register that has a non-null superset. For example, DPR_VFP2 is // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, // SPR's representative would be DPR_VFP2. This should work well if register // pressure tracking were modified such that a register use would increment the // pressure of the register class's representative and all of it's super // classes' representatives transitively. We have not implemented this because // of the difficulty prior to coalescing of modeling operand register classes // due to the common occurrence of cross class copies and subregister insertions // and extractions. std::pair ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const { const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: return TargetLowering::findRepresentativeClass(TRI, VT); // Use DPR as representative register class for all floating point // and vector types. Since there are 32 SPR registers and 32 DPR registers so // the cost is 1 for both f32 and f64. case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: RRC = &ARM::DPRRegClass; // When NEON is used for SP, only half of the register file is available // because operations that define both SP and DP results will be constrained // to the VFP2 class (D0-D15). We currently model this constraint prior to // coalescing by double-counting the SP regs. See the FIXME above. if (Subtarget->useNEONForSinglePrecisionFP()) Cost = 2; break; case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: RRC = &ARM::DPRRegClass; Cost = 2; break; case MVT::v4i64: RRC = &ARM::DPRRegClass; Cost = 4; break; case MVT::v8i64: RRC = &ARM::DPRRegClass; Cost = 8; break; } return std::make_pair(RRC, Cost); } const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((ARMISD::NodeType)Opcode) { case ARMISD::FIRST_NUMBER: break; case ARMISD::Wrapper: return "ARMISD::Wrapper"; case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC"; case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL"; case ARMISD::CALL: return "ARMISD::CALL"; case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; case ARMISD::tCALL: return "ARMISD::tCALL"; case ARMISD::BRCOND: return "ARMISD::BRCOND"; case ARMISD::BR_JT: return "ARMISD::BR_JT"; case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG"; case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; case ARMISD::CMP: return "ARMISD::CMP"; case ARMISD::CMN: return "ARMISD::CMN"; case ARMISD::CMPZ: return "ARMISD::CMPZ"; case ARMISD::CMPFP: return "ARMISD::CMPFP"; case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; case ARMISD::CMOV: return "ARMISD::CMOV"; case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; case ARMISD::RRX: return "ARMISD::RRX"; case ARMISD::ADDC: return "ARMISD::ADDC"; case ARMISD::ADDE: return "ARMISD::ADDE"; case ARMISD::SUBC: return "ARMISD::SUBC"; case ARMISD::SUBE: return "ARMISD::SUBE"; case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP"; case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH"; case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR"; case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; case ARMISD::WIN__CHKSTK: return "ARMISD:::WIN__CHKSTK"; case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; case ARMISD::VCEQ: return "ARMISD::VCEQ"; case ARMISD::VCEQZ: return "ARMISD::VCEQZ"; case ARMISD::VCGE: return "ARMISD::VCGE"; case ARMISD::VCGEZ: return "ARMISD::VCGEZ"; case ARMISD::VCLEZ: return "ARMISD::VCLEZ"; case ARMISD::VCGEU: return "ARMISD::VCGEU"; case ARMISD::VCGT: return "ARMISD::VCGT"; case ARMISD::VCGTZ: return "ARMISD::VCGTZ"; case ARMISD::VCLTZ: return "ARMISD::VCLTZ"; case ARMISD::VCGTU: return "ARMISD::VCGTU"; case ARMISD::VTST: return "ARMISD::VTST"; case ARMISD::VSHL: return "ARMISD::VSHL"; case ARMISD::VSHRs: return "ARMISD::VSHRs"; case ARMISD::VSHRu: return "ARMISD::VSHRu"; case ARMISD::VRSHRs: return "ARMISD::VRSHRs"; case ARMISD::VRSHRu: return "ARMISD::VRSHRu"; case ARMISD::VRSHRN: return "ARMISD::VRSHRN"; case ARMISD::VQSHLs: return "ARMISD::VQSHLs"; case ARMISD::VQSHLu: return "ARMISD::VQSHLu"; case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu"; case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs"; case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu"; case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu"; case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs"; case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu"; case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; case ARMISD::VSLI: return "ARMISD::VSLI"; case ARMISD::VSRI: return "ARMISD::VSRI"; case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM"; case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM"; case ARMISD::VDUP: return "ARMISD::VDUP"; case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; case ARMISD::VEXT: return "ARMISD::VEXT"; case ARMISD::VREV64: return "ARMISD::VREV64"; case ARMISD::VREV32: return "ARMISD::VREV32"; case ARMISD::VREV16: return "ARMISD::VREV16"; case ARMISD::VZIP: return "ARMISD::VZIP"; case ARMISD::VUZP: return "ARMISD::VUZP"; case ARMISD::VTRN: return "ARMISD::VTRN"; case ARMISD::VTBL1: return "ARMISD::VTBL1"; case ARMISD::VTBL2: return "ARMISD::VTBL2"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; case ARMISD::UMLAL: return "ARMISD::UMLAL"; case ARMISD::SMLAL: return "ARMISD::SMLAL"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; case ARMISD::VBSL: return "ARMISD::VBSL"; case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD"; case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD"; case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD"; case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD"; case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD"; case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD"; case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD"; case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD"; case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD"; case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD"; case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD"; case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD"; case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; } return nullptr; } EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const { if (!VT.isVector()) return getPointerTy(DL); return VT.changeVectorElementTypeToInteger(); } /// getRegClassFor - Return the register class that should be used for the /// specified value type. const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const { // Map v4i64 to QQ registers but do not make the type legal. Similarly map // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to // load / store 4 to 8 consecutive D registers. if (Subtarget->hasNEON()) { if (VT == MVT::v4i64) return &ARM::QQPRRegClass; if (VT == MVT::v8i64) return &ARM::QQQQPRRegClass; } return TargetLowering::getRegClassFor(VT); } // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the // source/dest is aligned and the copy size is large enough. We therefore want // to align such objects passed to memory intrinsics. bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, unsigned &PrefAlign) const { if (!isa(CI)) return false; MinSize = 8; // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 // cycle faster than 4-byte aligned LDM. PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4); return true; } // Create a fast isel object. FastISel * ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return ARM::createFastISel(funcInfo, libInfo); } Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { unsigned NumVals = N->getNumValues(); if (!NumVals) return Sched::RegPressure; for (unsigned i = 0; i != NumVals; ++i) { EVT VT = N->getValueType(i); if (VT == MVT::Glue || VT == MVT::Other) continue; if (VT.isFloatingPoint() || VT.isVector()) return Sched::ILP; } if (!N->isMachineOpcode()) return Sched::RegPressure; // Load are scheduled for latency even if there instruction itinerary // is not available. const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); if (MCID.getNumDefs() == 0) return Sched::RegPressure; if (!Itins->isEmpty() && Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) return Sched::ILP; return Sched::RegPressure; } //===----------------------------------------------------------------------===// // Lowering Code //===----------------------------------------------------------------------===// /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { switch (CC) { default: llvm_unreachable("Unknown condition code!"); case ISD::SETNE: return ARMCC::NE; case ISD::SETEQ: return ARMCC::EQ; case ISD::SETGT: return ARMCC::GT; case ISD::SETGE: return ARMCC::GE; case ISD::SETLT: return ARMCC::LT; case ISD::SETLE: return ARMCC::LE; case ISD::SETUGT: return ARMCC::HI; case ISD::SETUGE: return ARMCC::HS; case ISD::SETULT: return ARMCC::LO; case ISD::SETULE: return ARMCC::LS; } } /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2) { CondCode2 = ARMCC::AL; switch (CC) { default: llvm_unreachable("Unknown FP condition!"); case ISD::SETEQ: case ISD::SETOEQ: CondCode = ARMCC::EQ; break; case ISD::SETGT: case ISD::SETOGT: CondCode = ARMCC::GT; break; case ISD::SETGE: case ISD::SETOGE: CondCode = ARMCC::GE; break; case ISD::SETOLT: CondCode = ARMCC::MI; break; case ISD::SETOLE: CondCode = ARMCC::LS; break; case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; case ISD::SETO: CondCode = ARMCC::VC; break; case ISD::SETUO: CondCode = ARMCC::VS; break; case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; case ISD::SETUGT: CondCode = ARMCC::HI; break; case ISD::SETUGE: CondCode = ARMCC::PL; break; case ISD::SETLT: case ISD::SETULT: CondCode = ARMCC::LT; break; case ISD::SETLE: case ISD::SETULE: CondCode = ARMCC::LE; break; case ISD::SETNE: case ISD::SETUNE: CondCode = ARMCC::NE; break; } } //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// #include "ARMGenCallingConv.inc" /// getEffectiveCallingConv - Get the effective calling convention, taking into /// account presence of floating point hardware and calling convention /// limitations, such as support for variadic functions. CallingConv::ID ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, bool isVarArg) const { switch (CC) { default: llvm_unreachable("Unsupported calling convention"); case CallingConv::ARM_AAPCS: case CallingConv::ARM_APCS: case CallingConv::GHC: return CC; case CallingConv::ARM_AAPCS_VFP: return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; case CallingConv::C: if (!Subtarget->isAAPCS_ABI()) return CallingConv::ARM_APCS; else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && getTargetMachine().Options.FloatABIType == FloatABI::Hard && !isVarArg) return CallingConv::ARM_AAPCS_VFP; else return CallingConv::ARM_AAPCS; case CallingConv::Fast: case CallingConv::CXX_FAST_TLS: if (!Subtarget->isAAPCS_ABI()) { if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::Fast; return CallingConv::ARM_APCS; } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::ARM_AAPCS_VFP; else return CallingConv::ARM_AAPCS; } } /// CCAssignFnForNode - Selects the correct CCAssignFn for the given /// CallingConvention. CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const { switch (getEffectiveCallingConv(CC, isVarArg)) { default: llvm_unreachable("Unsupported calling convention"); case CallingConv::ARM_APCS: return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); case CallingConv::ARM_AAPCS: return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); case CallingConv::ARM_AAPCS_VFP: return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); case CallingConv::Fast: return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); case CallingConv::GHC: return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); } } /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. SDValue ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const { // Assign locations to each value returned by this call. SmallVector RVLocs; ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext(), Call); CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv, /* Return*/ true, isVarArg)); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign VA = RVLocs[i]; // Pass 'this' value directly from the argument to return value, to avoid // reg unit interference if (i == 0 && isThisReturn) { assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && "unexpected return calling convention register assignment"); InVals.push_back(ThisVal); continue; } SDValue Val; if (VA.needsCustom()) { // Handle f64 or half of a v2f64. SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); Chain = Lo.getValue(1); InFlag = Lo.getValue(2); VA = RVLocs[++i]; // skip ahead to next loc SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); Chain = Hi.getValue(1); InFlag = Hi.getValue(2); if (!Subtarget->isLittle()) std::swap (Lo, Hi); Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); if (VA.getLocVT() == MVT::v2f64) { SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, DAG.getConstant(0, dl, MVT::i32)); VA = RVLocs[++i]; // skip ahead to next loc Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); Chain = Lo.getValue(1); InFlag = Lo.getValue(2); VA = RVLocs[++i]; // skip ahead to next loc Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); Chain = Hi.getValue(1); InFlag = Hi.getValue(2); if (!Subtarget->isLittle()) std::swap (Lo, Hi); Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, DAG.getConstant(1, dl, MVT::i32)); } } else { Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), InFlag); Chain = Val.getValue(1); InFlag = Val.getValue(2); } switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); break; } InVals.push_back(Val); } return Chain; } /// LowerMemOpCallTo - Store the argument to the stack. SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); return DAG.getStore( Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), false, false, 0); } void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, RegsToPassVector &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, SmallVectorImpl &MemOpChains, ISD::ArgFlagsTy Flags) const { SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); unsigned id = Subtarget->isLittle() ? 0 : 1; RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); if (NextVA.isRegLoc()) RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); else { assert(NextVA.isMemLoc()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), dl, DAG, NextVA, Flags)); } } /// LowerCall - Lowering a call into a callseq_start <- /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter /// nodes. SDValue ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &dl = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &isTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool doesNotRet = CLI.DoesNotReturn; bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; bool isSibCall = false; auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); // Disable tail calls if they're not supported. if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") isTailCall = false; if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // We don't support GuaranteedTailCallOpt for ARM, only automatically // detected sibcalls. if (isTailCall) { ++NumTailCalls; isSibCall = true; } } // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext(), Call); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv, /* Return*/ false, isVarArg)); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); // For tail calls, memory operands are available in our caller's stack. if (isSibCall) NumBytes = 0; // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!isSibCall) Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), dl); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); RegsToPassVector RegsToPass; SmallVector MemOpChains; // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization, arguments are handled later. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; bool isByVal = Flags.isByVal(); // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::AExt: Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); break; } // f64 and v2f64 might be passed in i32 pairs and must be split into pieces if (VA.needsCustom()) { if (VA.getLocVT() == MVT::v2f64) { SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, DAG.getConstant(0, dl, MVT::i32)); SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, DAG.getConstant(1, dl, MVT::i32)); PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); VA = ArgLocs[++i]; // skip ahead to next loc if (VA.isRegLoc()) { PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); } else { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags)); } } else { PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); } } else if (VA.isRegLoc()) { if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) { assert(VA.getLocVT() == MVT::i32 && "unexpected calling convention register assignment"); assert(!Ins.empty() && Ins[0].VT == MVT::i32 && "unexpected use of 'returned'"); isThisReturn = true; } RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else if (isByVal) { assert(VA.isMemLoc()); unsigned offset = 0; // True if this byval aggregate will be split between registers // and memory. unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); if (CurByValIdx < ByValArgsCount) { unsigned RegBegin, RegEnd; CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); unsigned int i, j; for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), false, false, false, DAG.InferPtrAlignment(AddArg)); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(j, Load)); } // If parameter size outsides register area, "offset" value // helps us to calculate stack slot for remained part properly. offset = RegEnd - RegBegin; CCInfo.nextInRegsParam(); } if (Flags.getByValSize() > 4*offset) { auto PtrVT = getPointerTy(DAG.getDataLayout()); unsigned LocMemOffset = VA.getLocMemOffset(); SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, MVT::i32); SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, MVT::i32); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops)); } } else if (!isSibCall) { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; // Tail call byval lowering might overwrite argument registers so in case of // tail call optimization the copies to registers are lowered later. if (!isTailCall) for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } // For tail calls lower the arguments to the 'real' stack slot. if (isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments are stored to the stack, because the // outgoing stack slots may alias the incoming argument stack slots, and // the alias isn't otherwise explicit. This is slightly more conservative // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } InFlag = SDValue(); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. bool isDirect = false; bool isARMFunc = false; bool isLocalARMFunc = false; ARMFunctionInfo *AFI = MF.getInfo(); auto PtrVt = getPointerTy(DAG.getDataLayout()); if (Subtarget->genLongCalls()) { assert((Subtarget->isTargetWindows() || getTargetMachine().getRelocationModel() == Reloc::Static) && "long-calls with non-static relocation model!"); // Handle a global address or an external symbol. If it's not one of // those, the target's already in a register, so we don't need to do // anything extra. if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); // Create a constant pool entry for the callee address unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); // Get the address of the callee into a register SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 0); } else if (ExternalSymbolSDNode *S=dyn_cast(Callee)) { const char *Sym = S->getSymbol(); // Create a constant pool entry for the callee address unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 0); // Get the address of the callee into a register SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 0); } } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); isDirect = true; bool isDef = GV->isStrongDefinitionForLinker(); bool isStub = (!isDef && Subtarget->isTargetMachO()) && getTargetMachine().getRelocationModel() != Reloc::Static; isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); // ARM call to a local ARM function is predicable. isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); // tBX takes a register source operand. if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); Callee = DAG.getNode( ARMISD::WrapperPIC, dl, PtrVt, DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, true, 0); } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); unsigned TargetFlags = GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG; Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags); if (GV->hasDLLImportStorageClass()) Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); } else { // On ELF targets for PIC code, direct calls should go through the PLT unsigned OpFlags = 0; if (Subtarget->isTargetELF() && getTargetMachine().getRelocationModel() == Reloc::PIC_) OpFlags = ARMII::MO_PLT; Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, OpFlags); } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { isDirect = true; bool isStub = Subtarget->isTargetMachO() && getTargetMachine().getRelocationModel() != Reloc::Static; isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); // tBX takes a register source operand. const char *Sym = S->getSymbol(); if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 4); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 0); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); } else { unsigned OpFlags = 0; // On ELF targets for PIC code, direct calls should go through the PLT if (Subtarget->isTargetELF() && getTargetMachine().getRelocationModel() == Reloc::PIC_) OpFlags = ARMII::MO_PLT; Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, OpFlags); } } // FIXME: handle tail calls differently. unsigned CallOpc; if (Subtarget->isThumb()) { if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL; } else { if (!isDirect && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else if (doesNotRet && isDirect && Subtarget->hasRAS() && // Emit regular call when code size is the priority !MF.getFunction()->optForMinSize()) // "mov lr, pc; b _foo" to avoid confusing the RSP CallOpc = ARMISD::CALL_NOLINK; else CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; } std::vector Ops; Ops.push_back(Chain); Ops.push_back(Callee); // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. if (!isTailCall) { const uint32_t *Mask; const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); if (isThisReturn) { // For 'this' returns, use the R0-preserving mask if applicable Mask = ARI->getThisReturnPreservedMask(MF, CallConv); if (!Mask) { // Set isThisReturn to false if the calling convention is not one that // allows 'returned' to be modeled in this way, so LowerCallResult does // not try to pass 'this' straight through isThisReturn = false; Mask = ARI->getCallPreservedMask(MF, CallConv); } } else Mask = ARI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); } if (InFlag.getNode()) Ops.push_back(InFlag); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); if (isTailCall) { MF.getFrameInfo()->setHasTailCall(); return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); } // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); if (!Ins.empty()) InFlag = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we // return. return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, InVals, isThisReturn, isThisReturn ? OutVals[0] : SDValue()); } /// HandleByVal - Every parameter *after* a byval parameter is passed /// on the stack. Remember the next parameter register to allocate, /// and then confiscate the rest of the parameter registers to insure /// this. void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, unsigned Align) const { assert((State->getCallOrPrologue() == Prologue || State->getCallOrPrologue() == Call) && "unhandled ParmContext"); // Byval (as with any stack) slots are always at least 4 byte aligned. Align = std::max(Align, 4U); unsigned Reg = State->AllocateReg(GPRArgRegs); if (!Reg) return; unsigned AlignInRegs = Align / 4; unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; for (unsigned i = 0; i < Waste; ++i) Reg = State->AllocateReg(GPRArgRegs); if (!Reg) return; unsigned Excess = 4 * (ARM::R4 - Reg); // Special case when NSAA != SP and parameter size greater than size of // all remained GPR regs. In that case we can't split parameter, we must // send it to stack. We also must set NCRN to R4, so waste all // remained registers. const unsigned NSAAOffset = State->getNextStackOffset(); if (NSAAOffset != 0 && Size > Excess) { while (State->AllocateReg(GPRArgRegs)) ; return; } // First register for byval parameter is the first register that wasn't // allocated before this method call, so it would be "reg". // If parameter is small enough to be saved in range [reg, r4), then // the end (first after last) register would be reg + param-size-in-regs, // else parameter would be splitted between registers and stack, // end register would be r4 in this case. unsigned ByValRegBegin = Reg; unsigned ByValRegEnd = std::min(Reg + Size / 4, ARM::R4); State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); // Note, first register is allocated in the beginning of function already, // allocate remained amount of registers we need. for (unsigned i = Reg + 1; i != ByValRegEnd; ++i) State->AllocateReg(GPRArgRegs); // A byval parameter that is split between registers and memory needs its // size truncated here. // In the case where the entire structure fits in registers, we set the // size in memory to zero. Size = std::max(Size - Excess, 0); } /// MatchingStackOffset - Return true if the given stack call argument is /// already available in the same position (relatively) of the caller's /// incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, const TargetInstrInfo *TII) { unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast(Arg.getOperand(1))->getReg(); if (!TargetRegisterInfo::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) return false; if (!Flags.isByVal()) { if (!TII->isLoadFromStackSlot(Def, FI)) return false; } else { return false; } } else if (LoadSDNode *Ld = dyn_cast(Arg)) { if (Flags.isByVal()) // ByVal argument is passed in as a pointer but it's now being // dereferenced. e.g. // define @foo(%struct.X* %A) { // tail call @bar(%struct.X* byval %A) // } return false; SDValue Ptr = Ld->getBasePtr(); FrameIndexSDNode *FINode = dyn_cast(Ptr); if (!FINode) return false; FI = FINode->getIndex(); } else return false; assert(FI != INT_MAX); if (!MFI->isFixedObjectIndex(FI)) return false; return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); } /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. bool ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG& DAG) const { const Function *CallerF = DAG.getMachineFunction().getFunction(); CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; assert(Subtarget->supportsTailCall()); // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. // Do not sibcall optimize vararg calls unless the call site is not passing // any arguments. if (isVarArg && !Outs.empty()) return false; // Exception-handling functions need a special set of instructions to indicate // a return to the hardware. Tail-calling another function would probably // break this. if (CallerF->hasFnAttribute("interrupt")) return false; // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; // Externally-defined functions with weak linkage should not be // tail-called on ARM when the OS does not support dynamic // pre-emption of symbols, as the AAELF spec requires normal calls // to undefined weak functions to be replaced with a NOP or jump to the // next instruction. The behaviour of branch instructions in this // situation (as used for tail calls) is implementation-defined, so we // cannot rely on the linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); const Triple &TT = getTargetMachine().getTargetTriple(); if (GV->hasExternalWeakLinkage() && (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; } // If the calling conventions do not match, then we'd better make sure the // results are returned in the same way as what the caller expects. if (!CCMatch) { SmallVector RVLocs1; ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, *DAG.getContext(), Call); CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg)); SmallVector RVLocs2; ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, *DAG.getContext(), Call); CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg)); if (RVLocs1.size() != RVLocs2.size()) return false; for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) return false; if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) return false; if (RVLocs1[i].isRegLoc()) { if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) return false; } else { if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) return false; } } } // If Caller's vararg or byval argument has been split between registers and // stack, do not perform tail call, since part of the argument is in caller's // local frame. const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction(). getInfo(); if (AFI_Caller->getArgRegsSaveSize()) return false; // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector ArgLocs; ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext(), Call); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC, false, isVarArg)); if (CCInfo.getNextStackOffset()) { MachineFunction &MF = DAG.getMachineFunction(); // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; EVT RegVT = VA.getLocVT(); SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; if (VA.needsCustom()) { // f64 and vector types are split into multiple registers or // register/stack-slot combinations. The types will not match // the registers; give up on memory f64 refs until we figure // out what to do about this. if (!VA.isRegLoc()) return false; if (!ArgLocs[++i].isRegLoc()) return false; if (RegVT == MVT::v2f64) { if (!ArgLocs[++i].isRegLoc()) return false; if (!ArgLocs[++i].isRegLoc()) return false; } } else if (!VA.isRegLoc()) { if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, TII)) return false; } } } } return true; } bool ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true, isVarArg)); } static SDValue LowerInterruptReturn(SmallVectorImpl &RetOps, SDLoc DL, SelectionDAG &DAG) { const MachineFunction &MF = DAG.getMachineFunction(); const Function *F = MF.getFunction(); StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString(); // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset // version of the "preferred return address". These offsets affect the return // instruction if this is a return from PL1 without hypervisor extensions. // IRQ/FIQ: +4 "subs pc, lr, #4" // SWI: 0 "subs pc, lr, #0" // ABORT: +4 "subs pc, lr, #4" // UNDEF: +4/+2 "subs pc, lr, #0" // UNDEF varies depending on where the exception came from ARM or Thumb // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. int64_t LROffset; if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || IntKind == "ABORT") LROffset = 4; else if (IntKind == "SWI" || IntKind == "UNDEF") LROffset = 0; else report_fatal_error("Unsupported interrupt attribute. If present, value " "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); RetOps.insert(RetOps.begin() + 1, DAG.getConstant(LROffset, DL, MVT::i32, false)); return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); } SDValue ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, SDLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to a location. SmallVector RVLocs; // CCState - Info about the registers and stack slots. ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext(), Call); // Analyze outgoing return values. CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true, isVarArg)); SDValue Flag; SmallVector RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) bool isLittleEndian = Subtarget->isLittle(); MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); AFI->setReturnRegsCount(RVLocs.size()); // Copy the result values into the output registers. for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue Arg = OutVals[realRVLocIdx]; switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); break; } if (VA.needsCustom()) { if (VA.getLocVT() == MVT::v2f64) { // Extract the first half and return it in two registers. SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, DAG.getConstant(0, dl, MVT::i32)); SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Half); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc // Extract the 2nd half and fall through to handle it as an f64 value. Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, DAG.getConstant(1, dl, MVT::i32)); } // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is // available. SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(isLittleEndian ? 0 : 1), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(isLittleEndian ? 1 : 0), Flag); } else Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); // Guarantee that all emitted copies are // stuck together, avoiding something bad. Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (ARM::GPRRegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i32)); else if (ARM::DPRRegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } // Update chain and glue. RetOps[0] = Chain; if (Flag.getNode()) RetOps.push_back(Flag); // CPUs which aren't M-class use a special sequence to return from // exceptions (roughly, any instruction setting pc and cpsr simultaneously, // though we use "subs pc, lr, #N"). // // M-class CPUs actually use a normal return sequence with a special // (hardware-provided) value in LR, so the normal code path works. if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") && !Subtarget->isMClass()) { if (Subtarget->isThumb1Only()) report_fatal_error("interrupt attribute is not supported in Thumb1"); return LowerInterruptReturn(RetOps, dl, DAG); } return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); } bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { SDNode *VMov = Copy; // f64 returned in a pair of GPRs. SmallPtrSet Copies; for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != ISD::CopyToReg) return false; Copies.insert(*UI); } if (Copies.size() > 2) return false; for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); UI != UE; ++UI) { SDValue UseChain = UI->getOperand(0); if (Copies.count(UseChain.getNode())) // Second CopyToReg Copy = *UI; else { // We are at the top of this chain. // If the copy has a glue operand, we conservatively assume it // isn't safe to perform a tail call. if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) return false; // First CopyToReg TCChain = UseChain; } } } else if (Copy->getOpcode() == ISD::BITCAST) { // f32 returned in a single GPR. if (!Copy->hasOneUse()) return false; Copy = *Copy->use_begin(); if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) return false; // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else { return false; } bool HasRet = false; for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != ARMISD::RET_FLAG && UI->getOpcode() != ARMISD::INTRET_FLAG) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { if (!Subtarget->supportsTailCall()) return false; auto Attr = CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; return true; } // Trying to write a 64 bit value so need to split into two 32 bit values first, // and pass the lower and high parts through. static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); SDValue WriteValue = Op->getOperand(2); // This function is only supposed to be called for i64 type argument. assert(WriteValue.getValueType() == MVT::i64 && "LowerWRITE_REGISTER called for non-i64 type argument."); SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, DAG.getConstant(0, DL, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, DAG.getConstant(1, DL, MVT::i32)); SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOVi. static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { EVT PtrVT = Op.getValueType(); // FIXME there is no actual debug info here SDLoc dl(Op); ConstantPoolSDNode *CP = cast(Op); SDValue Res; if (CP->isMachineConstantPoolEntry()) Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlignment()); else Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment()); return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); } unsigned ARMTargetLowering::getJumpTableEncoding() const { return MachineJumpTableInfo::EK_Inline; } SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = 0; SDLoc DL(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); const BlockAddress *BA = cast(Op)->getBlockAddress(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); SDValue CPAddr; if (RelocM == Reloc::Static) { CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); } else { unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, ARMCP::CPBlockAddress, PCAdj); CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); } CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 0); if (RelocM == Reloc::Static) return Result; SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); } /// \brief Convert a TLS address reference into the correct sequence of loads /// and calls to compute the variable's address for Darwin, and return an /// SDValue containing the final node. /// Darwin only has one TLS scheme which must be capable of dealing with the /// fully general situation, in the worst case. This means: /// + "extern __thread" declaration. /// + Defined in a possibly unknown dynamic library. /// /// The general system is that each __thread variable has a [3 x i32] descriptor /// which contains information used by the runtime to calculate the address. The /// only part of this the compiler needs to know about is the first word, which /// contains a function pointer that must be called with the address of the /// entire descriptor in "r0". /// /// Since this descriptor may be in a different unit, in general access must /// proceed along the usual ARM rules. A common sequence to produce is: /// /// movw rT1, :lower16:_var$non_lazy_ptr /// movt rT1, :upper16:_var$non_lazy_ptr /// ldr r0, [rT1] /// ldr rT2, [r0] /// blx rT2 /// [...address now in r0...] SDValue ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); SDLoc DL(Op); // First step is to get the address of the actua global symbol. This is where // the TLS descriptor lives. SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG); // The first entry in the descriptor is a function pointer that we must call // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad(MVT::i32, DL, Chain, DescAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, true, true, 4); Chain = FuncTLVGet.getValue(1); MachineFunction &F = DAG.getMachineFunction(); MachineFrameInfo *MFI = F.getFrameInfo(); MFI->setAdjustsStack(true); // TLS calls preserve all registers except those that absolutely must be // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be // silly). auto TRI = getTargetMachine().getSubtargetImpl(*F.getFunction())->getRegisterInfo(); auto ARI = static_cast(TRI); const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction()); // Finally, we can make the call. This is just a degenerate version of a // normal AArch64 call node: r0 takes the address of the descriptor, and // returns the address of the variable in this thread. Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); Chain = DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), DAG.getRegisterMask(Mask), Chain.getValue(1)); return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) const { SDLoc dl(GA); EVT PtrVT = getPointerTy(DAG.getDataLayout()); unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 0); SDValue Chain = Argument.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); // call __tls_get_addr. ArgListTy Args; ArgListEntry Entry; Entry.Node = Argument; Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); Args.push_back(Entry); // FIXME: is there useful debug info available here? TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()), DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args), 0); std::pair CallResult = LowerCallTo(CLI); return CallResult.first; } // Lower ISD::GlobalTLSAddress using the "initial exec" or // "local exec" model. SDValue ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, SelectionDAG &DAG, TLSModel::Model model) const { const GlobalValue *GV = GA->getGlobal(); SDLoc dl(GA); SDValue Offset; SDValue Chain = DAG.getEntryNode(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Get the Thread Pointer SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); if (model == TLSModel::InitialExec) { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); // Initial exec model. unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 0); Chain = Offset.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 0); } else { // local exec model assert(model == TLSModel::LocalExec); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 0); } // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); } SDValue ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->isTargetDarwin()) return LowerGlobalTLSAddressDarwin(Op, DAG); // TODO: implement the "local dynamic" model assert(Subtarget->isTargetELF() && "Only ELF implemented here"); GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().Options.EmulatedTLS) return LowerToTLSEmulatedModel(GA, DAG); TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); switch (model) { case TLSModel::GeneralDynamic: case TLSModel::LocalDynamic: return LowerToTLSGeneralDynamicModel(GA, DAG); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModels(GA, DAG, model); } llvm_unreachable("bogus TLS model"); } SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { bool UseGOT_PREL = !(GV->hasHiddenVisibility() || GV->hasLocalLinkage()); MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create( GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj, UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier, /*AddCurrentAddress=*/UseGOT_PREL); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 0); SDValue Chain = Result.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); if (UseGOT_PREL) Result = DAG.getLoad(PtrVT, dl, Chain, Result, MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); return Result; } // If we have T2 ops, we can materialize the address directly via movt/movw // pair. This is always cheaper. if (Subtarget->useMovt(DAG.getMachineFunction())) { ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes. return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, DAG.getTargetGlobalAddress(GV, dl, PtrVT)); } else { SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); return DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 0); } } SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); if (Subtarget->useMovt(DAG.getMachineFunction())) ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into multiple nodes unsigned Wrapper = RelocM == Reloc::PIC_ ? ARMISD::WrapperPIC : ARMISD::Wrapper; SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); return Result; } SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); assert(Subtarget->useMovt(DAG.getMachineFunction()) && "Windows on ARM expects to use movw/movt"); const GlobalValue *GV = cast(Op)->getGlobal(); const ARMII::TOF TargetFlags = (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; SDLoc DL(Op); ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes. Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, TargetFlags)); if (GV->hasDLLImportStorageClass()) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); return Result; } SDValue ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Val = DAG.getConstant(0, dl, MVT::i32); return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1), Val); } SDValue ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); } SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, Op.getOperand(0)); } SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::arm_rbit: { assert(Op.getOperand(1).getValueType() == MVT::i32 && "RBIT intrinsic must have i32 type!"); return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1)); } case Intrinsic::arm_thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); } case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); SDValue CPAddr; unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb() ? 4 : 8); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex, ARMCP::CPLSDA, PCAdj); CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 0); if (RelocM == Reloc::PIC_) { SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); } return Result; } case Intrinsic::arm_neon_vmulls: case Intrinsic::arm_neon_vmullu: { unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) ? ARMISD::VMULLs : ARMISD::VMULLu; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } case Intrinsic::arm_neon_vminnm: case Intrinsic::arm_neon_vmaxnm: { unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) ? ISD::FMINNUM : ISD::FMAXNUM; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } case Intrinsic::arm_neon_vminu: case Intrinsic::arm_neon_vmaxu: { if (Op.getValueType().isFloatingPoint()) return SDValue(); unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) ? ISD::UMIN : ISD::UMAX; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } case Intrinsic::arm_neon_vmins: case Intrinsic::arm_neon_vmaxs: { // v{min,max}s is overloaded between signed integers and floats. if (!Op.getValueType().isFloatingPoint()) { unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) ? ISD::SMIN : ISD::SMAX; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) ? ISD::FMINNAN : ISD::FMAXNAN; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } } } static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { // FIXME: handle "fence singlethread" more efficiently. SDLoc dl(Op); if (!Subtarget->hasDataBarrier()) { // Some ARMv6 cpus can support data barriers with an mcr instruction. // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get // here. assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), DAG.getConstant(0, dl, MVT::i32)); } ConstantSDNode *OrdN = cast(Op.getOperand(1)); AtomicOrdering Ord = static_cast(OrdN->getZExtValue()); ARM_MB::MemBOpt Domain = ARM_MB::ISH; if (Subtarget->isMClass()) { // Only a full system barrier exists in the M-class architectures. Domain = ARM_MB::SY; } else if (Subtarget->isSwift() && Ord == Release) { // Swift happens to implement ISHST barriers in a way that's compatible with // Release semantics but weaker than ISH so we'd be fools not to use // it. Beware: other processors probably don't! Domain = ARM_MB::ISHST; } return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), DAG.getConstant(Domain, dl, MVT::i32)); } static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { // ARM pre v5TE and Thumb1 does not have preload instructions. if (!(Subtarget->isThumb2() || (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) // Just preserve the chain. return Op.getOperand(0); SDLoc dl(Op); unsigned isRead = ~cast(Op.getOperand(2))->getZExtValue() & 1; if (!isRead && (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) // ARMv7 with MP extension has PLDW. return Op.getOperand(0); unsigned isData = cast(Op.getOperand(4))->getZExtValue(); if (Subtarget->isThumb()) { // Invert the bits. isRead = ~isRead & 1; isData = ~isData & 1; } return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), DAG.getConstant(isData, dl, MVT::i32)); } static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *FuncInfo = MF.getInfo(); // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDLoc dl(Op); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), MachinePointerInfo(SV), false, false, 0); } SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, SDLoc dl) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); const TargetRegisterClass *RC; if (AFI->isThumb1OnlyFunction()) RC = &ARM::tGPRRegClass; else RC = &ARM::GPRRegClass; // Transform the arguments stored in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); SDValue ArgValue2; if (NextVA.isMemLoc()) { MachineFrameInfo *MFI = MF.getFrameInfo(); int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true); // Create load node to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); ArgValue2 = DAG.getLoad( MVT::i32, dl, Root, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, false, false, 0); } else { Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); } if (!Subtarget->isLittle()) std::swap (ArgValue, ArgValue2); return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); } // The remaining GPRs hold either the beginning of variable-argument // data, or the beginning of an aggregate passed by value (usually // byval). Either way, we allocate stack slots adjacent to the data // provided by our caller, and store the unallocated registers there. // If this is a variadic function, the va_list pointer will begin with // these values; otherwise, this reassembles a (byval) structure that // was split between registers and memory. // Return: The frame index registers were stored into. int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, SDLoc dl, SDValue &Chain, const Value *OrigArg, unsigned InRegsParamRecordIdx, int ArgOffset, unsigned ArgSize) const { // Currently, two use-cases possible: // Case #1. Non-var-args function, and we meet first byval parameter. // Setup first unallocated register as first byval register; // eat all remained registers // (these two actions are performed by HandleByVal method). // Then, here, we initialize stack frame with // "store-reg" instructions. // Case #2. Var-args function, that doesn't contain byval parameters. // The same: eat all remained unallocated registers, // initialize stack frame. MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned RBegin, REnd; if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); } else { unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs); RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx]; REnd = ARM::R4; } if (REnd != RBegin) ArgOffset = -4 * (ARM::R4 - RBegin); auto PtrVT = getPointerTy(DAG.getDataLayout()); int FrameIndex = MFI->CreateFixedObject(ArgSize, ArgOffset, false); SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); SmallVector MemOps; const TargetRegisterClass *RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { unsigned VReg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(OrigArg, 4 * i), false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return FrameIndex; } // Setup stack frame, the va_list pointer will start from. void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc dl, SDValue &Chain, unsigned ArgOffset, unsigned TotalArgRegsSaveSize, bool ForceMutable) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); // Try to store any remaining integer argument regs // to their spots on the stack so that they may be loaded by deferencing // the result of va_next. // If there is no regs to be stored, just point address after last // argument passed via stack. int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(), CCInfo.getNextStackOffset(), 4); AFI->setVarArgsFrameIndex(FrameIndex); } SDValue ARMTargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo(); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext(), Prologue); CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv, /* Return*/ false, isVarArg)); SmallVector ArgValues; SDValue ArgValue; Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); unsigned CurArgIdx = 0; // Initially ArgRegsSaveSize is zero. // Then we increase this value each time we meet byval parameter. // We also increase this value in case of varargs function. AFI->setArgRegsSaveSize(0); // Calculate the amount of stack space that we need to allocate to store // byval and variadic arguments that are passed in registers. // We need to know this before we allocate the first byval or variadic // argument, as they will be allocated a stack slot below the CFA (Canonical // Frame Address, the stack pointer at entry to the function). unsigned ArgRegBegin = ARM::R4; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) break; CCValAssign &VA = ArgLocs[i]; unsigned Index = VA.getValNo(); ISD::ArgFlagsTy Flags = Ins[Index].Flags; if (!Flags.isByVal()) continue; assert(VA.isMemLoc() && "unexpected byval pointer in reg"); unsigned RBegin, REnd; CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); ArgRegBegin = std::min(ArgRegBegin, RBegin); CCInfo.nextInRegsParam(); } CCInfo.rewindByValRegsInfo(); int lastInsIndex = -1; if (isVarArg && MFI->hasVAStart()) { unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); if (RegIdx != array_lengthof(GPRArgRegs)) ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); } unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); auto PtrVT = getPointerTy(DAG.getDataLayout()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (Ins[VA.getValNo()].isOrigArg()) { std::advance(CurOrigArg, Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); } // Arguments stored in registers. if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); if (VA.needsCustom()) { // f64 and vector types are split up into multiple registers or // combinations of registers and stack slots. if (VA.getLocVT() == MVT::v2f64) { SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); VA = ArgLocs[++i]; // skip ahead to next loc SDValue ArgValue2; if (VA.isMemLoc()) { int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgValue2 = DAG.getLoad( MVT::f64, dl, Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, false, false, 0); } else { ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); } ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, ArgValue1, DAG.getIntPtrConstant(0, dl)); ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, ArgValue2, DAG.getIntPtrConstant(1, dl)); } else ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); } else { const TargetRegisterClass *RC; if (RegVT == MVT::f32) RC = &ARM::SPRRegClass; else if (RegVT == MVT::f64) RC = &ARM::DPRRegClass; else if (RegVT == MVT::v2f64) RC = &ARM::QPRRegClass; else if (RegVT == MVT::i32) RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); } // If this is an 8 or 16-bit value, it is really passed promoted // to 32 bits. Insert an assert[sz]ext to capture this, then // truncate to the right size. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); break; case CCValAssign::SExt: ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); break; case CCValAssign::ZExt: ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); break; } InVals.push_back(ArgValue); } else { // VA.isRegLoc() // sanity check assert(VA.isMemLoc()); assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); int index = VA.getValNo(); // Some Ins[] entries become multiple ArgLoc[] entries. // Process them only once. if (index != lastInsIndex) { ISD::ArgFlagsTy Flags = Ins[index].Flags; // FIXME: For now, all byval parameter objects are marked mutable. // This can be changed with more analysis. // In case of tail call optimization mark all arguments mutable. // Since they could be overwritten by lowering of arguments in case of // a tail call. if (Flags.isByVal()) { assert(Ins[index].isOrigArg() && "Byval arguments cannot be implicit"); unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); int FrameIndex = StoreByValRegs( CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, VA.getLocMemOffset(), Flags.getByValSize()); InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); CCInfo.nextInRegsParam(); } else { unsigned FIOffset = VA.getLocMemOffset(); int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, FIOffset, true); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(DAG.getLoad( VA.getValVT(), dl, Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, false, false, 0)); } lastInsIndex = index; } } } // varargs if (isVarArg && MFI->hasVAStart()) VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(), TotalArgRegsSaveSize); AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); return Chain; } /// isFloatingPointZero - Return true if this is +0.0. static bool isFloatingPointZero(SDValue Op) { if (ConstantFPSDNode *CFP = dyn_cast(Op)) return CFP->getValueAPF().isPosZero(); else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { // Maybe this has already been legalized into the constant pool? if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { SDValue WrapperOp = Op.getOperand(1).getOperand(0); if (ConstantPoolSDNode *CP = dyn_cast(WrapperOp)) if (const ConstantFP *CFP = dyn_cast(CP->getConstVal())) return CFP->getValueAPF().isPosZero(); } } else if (Op->getOpcode() == ISD::BITCAST && Op->getValueType(0) == MVT::f64) { // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) // created by LowerConstantFP(). SDValue BitcastOp = Op->getOperand(0); if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && isNullConstant(BitcastOp->getOperand(0))) return true; } return false; } /// Returns appropriate ARM CMP (cmp) and corresponding condition code for /// the given operands. SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &ARMcc, SelectionDAG &DAG, SDLoc dl) const { if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { unsigned C = RHSC->getZExtValue(); if (!isLegalICmpImmediate(C)) { // Constant does not fit, try adjusting it by one? switch (CC) { default: break; case ISD::SETLT: case ISD::SETGE: if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; RHS = DAG.getConstant(C - 1, dl, MVT::i32); } break; case ISD::SETULT: case ISD::SETUGE: if (C != 0 && isLegalICmpImmediate(C-1)) { CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; RHS = DAG.getConstant(C - 1, dl, MVT::i32); } break; case ISD::SETLE: case ISD::SETGT: if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; RHS = DAG.getConstant(C + 1, dl, MVT::i32); } break; case ISD::SETULE: case ISD::SETUGT: if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; RHS = DAG.getConstant(C + 1, dl, MVT::i32); } break; } } } ARMCC::CondCodes CondCode = IntCCToARMCC(CC); ARMISD::NodeType CompareType; switch (CondCode) { default: CompareType = ARMISD::CMP; break; case ARMCC::EQ: case ARMCC::NE: // Uses only Z Flag CompareType = ARMISD::CMPZ; break; } ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); } /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, SDLoc dl) const { assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64); SDValue Cmp; if (!isFloatingPointZero(RHS)) Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS); else Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS); return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); } /// duplicateCmp - Glue values can have only one use, so this function /// duplicates a comparison node. SDValue ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { unsigned Opc = Cmp.getOpcode(); SDLoc DL(Cmp); if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); Cmp = Cmp.getOperand(0); Opc = Cmp.getOpcode(); if (Opc == ARMISD::CMPFP) Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); else { assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); } return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); } std::pair ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const { assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); SDValue Value, OverflowCmp; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDLoc dl(Op); // FIXME: We are currently always generating CMPs because we don't support // generating CMN through the backend. This is not as good as the natural // CMP case because it causes a register dependency and cannot be folded // later. switch (Op.getOpcode()) { default: llvm_unreachable("Unknown overflow instruction!"); case ISD::SADDO: ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); break; case ISD::UADDO: ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); break; case ISD::SSUBO: ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); break; case ISD::USUBO: ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); break; } // switch (...) return std::make_pair(Value, OverflowCmp); } SDValue ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) return SDValue(); SDValue Value, OverflowCmp; SDValue ARMcc; std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDLoc dl(Op); // We use 0 and 1 as false and true values. SDValue TVal = DAG.getConstant(1, dl, MVT::i32); SDValue FVal = DAG.getConstant(0, dl, MVT::i32); EVT VT = Op.getValueType(); SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, ARMcc, CCR, OverflowCmp); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue SelectTrue = Op.getOperand(1); SDValue SelectFalse = Op.getOperand(2); SDLoc dl(Op); unsigned Opc = Cond.getOpcode(); if (Cond.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO)) { if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) return SDValue(); SDValue Value, OverflowCmp; SDValue ARMcc; std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); EVT VT = Op.getValueType(); return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, OverflowCmp, DAG); } // Convert: // // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) // if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { const ConstantSDNode *CMOVTrue = dyn_cast(Cond.getOperand(0)); const ConstantSDNode *CMOVFalse = dyn_cast(Cond.getOperand(1)); if (CMOVTrue && CMOVFalse) { unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); SDValue True; SDValue False; if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { True = SelectTrue; False = SelectFalse; } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { True = SelectFalse; False = SelectTrue; } if (True.getNode() && False.getNode()) { EVT VT = Op.getValueType(); SDValue ARMcc = Cond.getOperand(2); SDValue CCR = Cond.getOperand(3); SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); assert(True.getValueType() == VT); return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); } } } // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the // undefined bits before doing a full-word comparison with zero. Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, DAG.getConstant(1, dl, Cond.getValueType())); return DAG.getSelectCC(dl, Cond, DAG.getConstant(0, dl, Cond.getValueType()), SelectTrue, SelectFalse, ISD::SETNE); } static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps) { // Start by selecting the GE condition code for opcodes that return true for // 'equality' if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || CC == ISD::SETULE) CondCode = ARMCC::GE; // and GT for opcodes that return false for 'equality'. else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || CC == ISD::SETULT) CondCode = ARMCC::GT; // Since we are constrained to GE/GT, if the opcode contains 'less', we need // to swap the compare operands. if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || CC == ISD::SETULT) swpCmpOps = true; // Both GT and GE are ordered comparisons, and return false for 'unordered'. // If we have an unordered opcode, we need to swap the operands to the VSEL // instruction (effectively negating the condition). // // This also has the effect of swapping which one of 'less' or 'greater' // returns true, so we also swap the compare operands. It also switches // whether we return true for 'equality', so we compensate by picking the // opposite condition code to our original choice. if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || CC == ISD::SETUGT) { swpCmpOps = !swpCmpOps; swpVselOps = !swpVselOps; CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; } // 'ordered' is 'anything but unordered', so use the VS condition code and // swap the VSEL operands. if (CC == ISD::SETO) { CondCode = ARMCC::VS; swpVselOps = true; } // 'unordered or not equal' is 'anything but equal', so use the EQ condition // code and swap the VSEL operands. if (CC == ISD::SETUNE) { CondCode = ARMCC::EQ; swpVselOps = true; } } SDValue ARMTargetLowering::getCMOV(SDLoc dl, EVT VT, SDValue FalseVal, SDValue TrueVal, SDValue ARMcc, SDValue CCR, SDValue Cmp, SelectionDAG &DAG) const { if (Subtarget->isFPOnlySP() && VT == MVT::f64) { FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), FalseVal); TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), TrueVal); SDValue TrueLow = TrueVal.getValue(0); SDValue TrueHigh = TrueVal.getValue(1); SDValue FalseLow = FalseVal.getValue(0); SDValue FalseHigh = FalseVal.getValue(1); SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, ARMcc, CCR, Cmp); SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, ARMcc, CCR, duplicateCmp(Cmp, DAG)); return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); } else { return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp); } } SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDValue TrueVal = Op.getOperand(2); SDValue FalseVal = Op.getOperand(3); SDLoc dl(Op); if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, dl); // If softenSetCCOperands only returned one value, we should compare it to // zero. if (!RHS.getNode()) { RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } if (LHS.getValueType() == MVT::i32) { // Try to generate VSEL on ARMv8. // The VSEL instruction can't use all the usual ARM condition // codes: it only has two bits to select the condition code, so it's // constrained to use only GE, GT, VS and EQ. // // To implement all the various ISD::SETXXX opcodes, we sometimes need to // swap the operands of the previous compare instruction (effectively // inverting the compare condition, swapping 'less' and 'greater') and // sometimes need to swap the operands to the VSEL (which inverts the // condition in the sense of firing whenever the previous condition didn't) if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || TrueVal.getValueType() == MVT::f64)) { ARMCC::CondCodes CondCode = IntCCToARMCC(CC); if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || CondCode == ARMCC::VC || CondCode == ARMCC::NE) { CC = ISD::getSetCCInverse(CC, true); std::swap(TrueVal, FalseVal); } } SDValue ARMcc; SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); } ARMCC::CondCodes CondCode, CondCode2; FPCCToARMCC(CC, CondCode, CondCode2); // Try to generate VMAXNM/VMINNM on ARMv8. if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || TrueVal.getValueType() == MVT::f64)) { bool swpCmpOps = false; bool swpVselOps = false; checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { if (swpCmpOps) std::swap(LHS, RHS); if (swpVselOps) std::swap(TrueVal, FalseVal); } } SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); if (CondCode2 != ARMCC::AL) { SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); // FIXME: Needs another CMP because flag can have but one use. SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); } return Result; } /// canChangeToInt - Given the fp compare operand, return true if it is suitable /// to morph to an integer compare sequence. static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget) { SDNode *N = Op.getNode(); if (!N->hasOneUse()) // Otherwise it requires moving the value from fp to integer registers. return false; if (!N->getNumValues()) return false; EVT VT = Op.getValueType(); if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) // f32 case is generally profitable. f64 case only makes sense when vcmpe + // vmrs are very slow, e.g. cortex-a8. return false; if (isFloatingPointZero(Op)) { SeenZero = true; return true; } return ISD::isNormalLoad(N); } static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { if (isFloatingPointZero(Op)) return DAG.getConstant(0, SDLoc(Op), MVT::i32); if (LoadSDNode *Ld = dyn_cast(Op)) return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Ld->getAlignment()); llvm_unreachable("Unknown VFP cmp argument!"); } static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2) { SDLoc dl(Op); if (isFloatingPointZero(Op)) { RetVal1 = DAG.getConstant(0, dl, MVT::i32); RetVal2 = DAG.getConstant(0, dl, MVT::i32); return; } if (LoadSDNode *Ld = dyn_cast(Op)) { SDValue Ptr = Ld->getBasePtr(); RetVal1 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Ld->getAlignment()); EVT PtrType = Ptr.getValueType(); unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); SDValue NewPtr = DAG.getNode(ISD::ADD, dl, PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, Ld->getPointerInfo().getWithOffset(4), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), NewAlign); return; } llvm_unreachable("Unknown VFP cmp argument!"); } /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some /// f32 and even f64 comparisons to integer ones. SDValue ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); SDLoc dl(Op); bool LHSSeenZero = false; bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); bool RHSSeenZero = false; bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { // If unsafe fp math optimization is enabled and there are no other uses of // the CMP operands, and the condition code is EQ or NE, we can optimize it // to an integer comparison. if (CC == ISD::SETOEQ) CC = ISD::SETEQ; else if (CC == ISD::SETUNE) CC = ISD::SETNE; SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); SDValue ARMcc; if (LHS.getValueType() == MVT::f32) { LHS = DAG.getNode(ISD::AND, dl, MVT::i32, bitcastf32Toi32(LHS, DAG), Mask); RHS = DAG.getNode(ISD::AND, dl, MVT::i32, bitcastf32Toi32(RHS, DAG), Mask); SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, Cmp); } SDValue LHS1, LHS2; SDValue RHS1, RHS2; expandf64Toi32(LHS, DAG, LHS1, LHS2); expandf64Toi32(RHS, DAG, RHS1, RHS2); LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); ARMCC::CondCodes CondCode = IntCCToARMCC(CC); ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); } return SDValue(); } SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); SDLoc dl(Op); if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, dl); // If softenSetCCOperands only returned one value, we should compare it to // zero. if (!RHS.getNode()) { RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } if (LHS.getValueType() == MVT::i32) { SDValue ARMcc; SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, Cmp); } assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); if (getTargetMachine().Options.UnsafeFPMath && (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE || CC == ISD::SETUNE)) { SDValue Result = OptimizeVFPBrcond(Op, DAG); if (Result.getNode()) return Result; } ARMCC::CondCodes CondCode, CondCode2; FPCCToARMCC(CC, CondCode, CondCode2); SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); if (CondCode2 != ARMCC::AL) { ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); } return Res; } SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Table = Op.getOperand(1); SDValue Index = Op.getOperand(2); SDLoc dl(Op); EVT PTy = getPointerTy(DAG.getDataLayout()); JumpTableSDNode *JT = cast(Table); SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table); if (Subtarget->isThumb2()) { // Thumb2 uses a two-level jump. That is, it jumps into the jump table // which does another jump to the destination. This also makes it easier // to translate it to TBB / TBH later. // FIXME: This might not work if the function is extremely large. return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, Addr, Op.getOperand(2), JTI); } if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), false, false, false, 0); Chain = Addr.getValue(1); Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } else { Addr = DAG.getLoad(PTy, dl, Chain, Addr, MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), false, false, false, 0); Chain = Addr.getValue(1); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } } static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); SDLoc dl(Op); if (Op.getValueType().getVectorElementType() == MVT::i32) { if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) return Op; return DAG.UnrollVectorOp(Op.getNode()); } assert(Op.getOperand(0).getValueType() == MVT::v4f32 && "Invalid type for custom lowering!"); if (VT != MVT::v4i16) return DAG.UnrollVectorOp(Op.getNode()); Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); } SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorFP_TO_INT(Op, DAG); if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::FP_TO_SINT) LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } return Op; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); SDLoc dl(Op); if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { if (VT.getVectorElementType() == MVT::f32) return Op; return DAG.UnrollVectorOp(Op.getNode()); } assert(Op.getOperand(0).getValueType() == MVT::v4i16 && "Invalid type for custom lowering!"); if (VT != MVT::v4f32) return DAG.UnrollVectorOp(Op.getNode()); unsigned CastOpc; unsigned Opc; switch (Op.getOpcode()) { default: llvm_unreachable("Invalid opcode!"); case ISD::SINT_TO_FP: CastOpc = ISD::SIGN_EXTEND; Opc = ISD::SINT_TO_FP; break; case ISD::UINT_TO_FP: CastOpc = ISD::ZERO_EXTEND; Opc = ISD::UINT_TO_FP; break; } Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0)); return DAG.getNode(Opc, dl, VT, Op); } SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorINT_TO_FP(Op, DAG); if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::SINT_TO_FP) LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); else LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } return Op; } SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // Implement fcopysign with a fabs and a conditional fneg. SDValue Tmp0 = Op.getOperand(0); SDValue Tmp1 = Op.getOperand(1); SDLoc dl(Op); EVT VT = Op.getValueType(); EVT SrcVT = Tmp1.getValueType(); bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || Tmp0.getOpcode() == ARMISD::VMOVDRR; bool UseNEON = !InGPR && Subtarget->hasNEON(); if (UseNEON) { // Use VBSL to copy the sign bit. unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80); SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; if (VT == MVT::f64) Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), DAG.getConstant(32, dl, MVT::i32)); else /*if (VT == MVT::f32)*/ Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); if (SrcVT == MVT::f32) { Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); if (VT == MVT::f64) Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), DAG.getConstant(32, dl, MVT::i32)); } else if (VT == MVT::f32) Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), DAG.getConstant(32, dl, MVT::i32)); Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff), dl, MVT::i32); AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); if (VT == MVT::f32) { Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, DAG.getConstant(0, dl, MVT::i32)); } else { Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); } return Res; } // Bitcast operand 1 to i32. if (SrcVT == MVT::f64) Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Tmp1).getValue(1); Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); // Or in the signbit with integer operations. SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); if (VT == MVT::f32) { Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); } // f64: Or the high part with signbit and then combine two parts. Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Tmp0); SDValue Lo = Tmp0.getValue(0); SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); } SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MFI->setReturnAddressIsTaken(true); if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); EVT VT = Op.getValueType(); SDLoc dl(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(4, dl, MVT::i32); return DAG.getLoad(VT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); } // Return LR, which contains the return address. Mark it an implicit live-in. unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { const ARMBaseRegisterInfo &ARI = *static_cast(RegInfo); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MFI->setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); unsigned FrameReg = ARI.getFrameRegister(MF); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, MachinePointerInfo(), false, false, false, 0); return FrameAddr; } // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { unsigned Reg = StringSwitch(RegName) .Case("sp", ARM::SP) .Default(0); if (Reg) return Reg; report_fatal_error(Twine("Invalid register name \"" + StringRef(RegName) + "\".")); } // Result is 64 bit value so split into two 32 bit values and return as a // pair of values. static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) { SDLoc DL(N); // This function is only supposed to be called for i64 type destination. assert(N->getValueType(0) == MVT::i64 && "ExpandREAD_REGISTER called for non-i64 type result."); SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), N->getOperand(0), N->getOperand(1)); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), Read.getValue(1))); Results.push_back(Read.getOperand(0)); } /// \p BC is a bitcast that is about to be turned into a VMOVDRR. /// When \p DstVT, the destination type of \p BC, is on the vector /// register bank and the source of bitcast, \p Op, operates on the same bank, /// it might be possible to combine them, such that everything stays on the /// vector register bank. /// \p return The node that would replace \p BT, if the combine /// is possible. static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG) { SDValue Op = BC->getOperand(0); EVT DstVT = BC->getValueType(0); // The only vector instruction that can produce a scalar (remember, // since the bitcast was about to be turned into VMOVDRR, the source // type is i64) from a vector is EXTRACT_VECTOR_ELT. // Moreover, we can do this combine only if there is one use. // Finally, if the destination type is not a vector, there is not // much point on forcing everything on the vector bank. if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !Op.hasOneUse()) return SDValue(); // If the index is not constant, we will introduce an additional // multiply that will stick. // Give up in that case. ConstantSDNode *Index = dyn_cast(Op.getOperand(1)); if (!Index) return SDValue(); unsigned DstNumElt = DstVT.getVectorNumElements(); // Compute the new index. const APInt &APIntIndex = Index->getAPIntValue(); APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); NewIndex *= APIntIndex; // Check if the new constant index fits into i32. if (NewIndex.getBitWidth() > 32) return SDValue(); // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) SDLoc dl(Op); SDValue ExtractSrc = Op.getOperand(0); EVT VecVT = EVT::getVectorVT( *DAG.getContext(), DstVT.getScalarType(), ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); } /// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 /// operand type is illegal (e.g., v2f32 for a target that doesn't support /// vectors), since the legalizer won't know what to do with that. static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc dl(N); SDValue Op = N->getOperand(0); // This function is only supposed to be called for i64 types, either as the // source or destination of the bit convert. EVT SrcVT = Op.getValueType(); EVT DstVT = N->getValueType(0); assert((SrcVT == MVT::i64 || DstVT == MVT::i64) && "ExpandBITCAST called for non-i64 type"); // Turn i64->f64 into VMOVDRR. if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { // Do not force values to GPRs (this is what VMOVDRR does for the inputs) // if we can combine the bitcast with its source. if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) return Val; SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(1, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, DstVT, DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); } // Turn f64->i64 into VMOVRRD. if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { SDValue Cvt; if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && SrcVT.getVectorNumElements() > 1) Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); else Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Op); // Merge the pieces into a single i64 value. return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); } return SDValue(); } /// getZeroVector - Returns a vector of specified type with all zero elements. /// Zero vectors are used to represent vector negation and in those cases /// will be implemented with the NEON VNEG instruction. However, VNEG does /// not support i64 elements, so sometimes the zero vectors will need to be /// explicitly constructed. Regardless, use a canonical VMOV to create the /// zero vector. static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); // The canonical modified immediate encoding of a zero vector is....0! SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); } /// LowerShiftRightParts - Lower SRA_PARTS, which returns two /// i32 values and take a 2 x i32 value to shift plus a shift amount. SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); SDValue ARMcc; unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, DAG.getConstant(VTBits, dl, MVT::i32)); SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), ISD::SETGE, ARMcc, DAG, dl); SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i32 values and take a 2 x i32 value to shift plus a shift amount. SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); SDValue ARMcc; assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, DAG.getConstant(VTBits, dl, MVT::i32)); SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), ISD::SETGE, ARMcc, DAG, dl); SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc, CCR, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { // The rounding mode is in bits 23:22 of the FPSCR. // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) // so that the shift + and get folded into a bitfield extract. SDLoc dl(Op); SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, DAG.getConstant(22, dl, MVT::i32)); return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, DAG.getConstant(3, dl, MVT::i32)); } static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { SDLoc dl(N); EVT VT = N->getValueType(0); if (VT.isVector()) { assert(ST->hasNEON()); // Compute the least significant set bit: LSB = X & -X SDValue X = N->getOperand(0); SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); EVT ElemTy = VT.getVectorElementType(); if (ElemTy == MVT::i8) { // Compute with: cttz(x) = ctpop(lsb - 1) SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(1, dl, ElemTy)); SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); return DAG.getNode(ISD::CTPOP, dl, VT, Bits); } if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 unsigned NumBits = ElemTy.getSizeInBits(); SDValue WidthMinus1 = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); } // Compute with: cttz(x) = ctpop(lsb - 1) // Since we can only compute the number of bits in a byte with vcnt.8, we // have to gather the result with pairwise addition (vpaddl) for i16, i32, // and i64. // Compute LSB - 1. SDValue Bits; if (ElemTy == MVT::i64) { // Load constant 0xffff'ffff'ffff'ffff to register. SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(0x1eff, dl, MVT::i32)); Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); } else { SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(1, dl, ElemTy)); Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); } // Count #bits with vcnt.8. EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); // Gather the #bits with vpaddl (pairwise add.) EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), Cnt8); if (ElemTy == MVT::i16) return Cnt16; EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32; SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), Cnt16); if (ElemTy == MVT::i32) return Cnt32; assert(ElemTy == MVT::i64); SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), Cnt32); return Cnt64; } if (!ST->hasV6T2Ops()) return SDValue(); SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); return DAG.getNode(ISD::CTLZ, dl, VT, rbit); } /// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count /// for each 16-bit element from operand, repeated. The basic idea is to /// leverage vcnt to get the 8-bit counts, gather and add the results. /// /// Trace for v4i16: /// input = [v0 v1 v2 v3 ] (vi 16-bit element) /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element) /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi) /// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] /// [b0 b1 b2 b3 b4 b5 b6 b7] /// +[b1 b0 b3 b2 b5 b4 b7 b6] /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0, /// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits) static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDLoc DL(N); EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0)); SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0); SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1); SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2); return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3); } /// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the /// bit-count for each 16-bit element from the operand. We need slightly /// different sequencing for v4i16 and v8i16 to stay within NEON's available /// 64/128-bit registers. /// /// Trace for v4i16: /// input = [v0 v1 v2 v3 ] (vi 16-bit element) /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi) /// v8i16:Extended = [k0 k1 k2 k3 k0 k1 k2 k3 ] /// v4i16:Extracted = [k0 k1 k2 k3 ] static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDLoc DL(N); SDValue BitCounts = getCTPOP16BitCounts(N, DAG); if (VT.is64BitVector()) { SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended, DAG.getIntPtrConstant(0, DL)); } else { SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, BitCounts, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted); } } /// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the /// bit-count for each 32-bit element from the operand. The idea here is /// to split the vector into 16-bit elements, leverage the 16-bit count /// routine, and then combine the results. /// /// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged): /// input = [v0 v1 ] (vi: 32-bit elements) /// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1]) /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi) /// vrev: N0 = [k1 k0 k3 k2 ] /// [k0 k1 k2 k3 ] /// N1 =+[k1 k0 k3 k2 ] /// [k0 k2 k1 k3 ] /// N2 =+[k1 k3 k0 k2 ] /// [k0 k2 k1 k3 ] /// Extended =+[k1 k3 k0 k2 ] /// [k0 k2 ] /// Extracted=+[k1 k3 ] /// static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDLoc DL(N); EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0)); SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG); SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16); SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0); SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1); if (VT.is64BitVector()) { SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended, DAG.getIntPtrConstant(0, DL)); } else { SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted); } } static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && "Unexpected type for custom ctpop lowering"); if (VT.getVectorElementType() == MVT::i32) return lowerCTPOP32BitElements(N, DAG); else return lowerCTPOP16BitElements(N, DAG); } static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); SDLoc dl(N); if (!VT.isVector()) return SDValue(); // Lower vector shifts on NEON to use VSHL. assert(ST->hasNEON() && "unexpected vector shift"); // Left shifts translate directly to the vshiftu intrinsic. if (N->getOpcode() == ISD::SHL) return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl, MVT::i32), N->getOperand(0), N->getOperand(1)); assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); // NEON uses the same intrinsics for both left and right shifts. For // right shifts, the shift amounts are negative, so negate the vector of // shift amounts. EVT ShiftVT = N->getOperand(1).getValueType(); SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? Intrinsic::arm_neon_vshifts : Intrinsic::arm_neon_vshiftu); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(vshiftInt, dl, MVT::i32), N->getOperand(0), NegatedCount); } static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); SDLoc dl(N); // We can get here for a node like i32 = ISD::SHL i32, i64 if (VT != MVT::i64) return SDValue(); assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && "Unknown shift to lower!"); // We only lower SRA, SRL of 1 here, all others use generic lowering. if (!isOneConstant(N->getOperand(1))) return SDValue(); // If we are in thumb mode, we don't have RRX. if (ST->isThumb1Only()) return SDValue(); // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), DAG.getConstant(1, dl, MVT::i32)); // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and // captures the result into a carry flag. unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); // The low part is an ARMISD::RRX operand, which shifts the carry in. Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); // Merge the pieces into a single i64 value. return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); } static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { SDValue TmpOp0, TmpOp1; bool Invert = false; bool Swap = false; unsigned Opc = 0; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); EVT VT = Op.getValueType(); ISD::CondCode SetCCOpcode = cast(CC)->get(); SDLoc dl(Op); if (CmpVT.getVectorElementType() == MVT::i64) // 64-bit comparisons are not legal. We've marked SETCC as non-Custom, // but it's possible that our operands are 64-bit but our result is 32-bit. // Bail in this case. return SDValue(); if (Op1.getValueType().isFloatingPoint()) { switch (SetCCOpcode) { default: llvm_unreachable("Illegal FP comparison"); case ISD::SETUNE: case ISD::SETNE: Invert = true; // Fallthrough case ISD::SETOEQ: case ISD::SETEQ: Opc = ARMISD::VCEQ; break; case ISD::SETOLT: case ISD::SETLT: Swap = true; // Fallthrough case ISD::SETOGT: case ISD::SETGT: Opc = ARMISD::VCGT; break; case ISD::SETOLE: case ISD::SETLE: Swap = true; // Fallthrough case ISD::SETOGE: case ISD::SETGE: Opc = ARMISD::VCGE; break; case ISD::SETUGE: Swap = true; // Fallthrough case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; case ISD::SETUGT: Swap = true; // Fallthrough case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; case ISD::SETUEQ: Invert = true; // Fallthrough case ISD::SETONE: // Expand this to (OLT | OGT). TmpOp0 = Op0; TmpOp1 = Op1; Opc = ISD::OR; Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1); break; case ISD::SETUO: Invert = true; // Fallthrough case ISD::SETO: // Expand this to (OLT | OGE). TmpOp0 = Op0; TmpOp1 = Op1; Opc = ISD::OR; Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1); break; } } else { // Integer comparisons. switch (SetCCOpcode) { default: llvm_unreachable("Illegal integer comparison"); case ISD::SETNE: Invert = true; case ISD::SETEQ: Opc = ARMISD::VCEQ; break; case ISD::SETLT: Swap = true; case ISD::SETGT: Opc = ARMISD::VCGT; break; case ISD::SETLE: Swap = true; case ISD::SETGE: Opc = ARMISD::VCGE; break; case ISD::SETULT: Swap = true; case ISD::SETUGT: Opc = ARMISD::VCGTU; break; case ISD::SETULE: Swap = true; case ISD::SETUGE: Opc = ARMISD::VCGEU; break; } // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). if (Opc == ARMISD::VCEQ) { SDValue AndOp; if (ISD::isBuildVectorAllZeros(Op1.getNode())) AndOp = Op0; else if (ISD::isBuildVectorAllZeros(Op0.getNode())) AndOp = Op1; // Ignore bitconvert. if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) AndOp = AndOp.getOperand(0); if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { Opc = ARMISD::VTST; Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); Invert = !Invert; } } } if (Swap) std::swap(Op0, Op1); // If one of the operands is a constant vector zero, attempt to fold the // comparison to a specialized compare-against-zero form. SDValue SingleOp; if (ISD::isBuildVectorAllZeros(Op1.getNode())) SingleOp = Op0; else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { if (Opc == ARMISD::VCGE) Opc = ARMISD::VCLEZ; else if (Opc == ARMISD::VCGT) Opc = ARMISD::VCLTZ; SingleOp = Op1; } SDValue Result; if (SingleOp.getNode()) { switch (Opc) { case ARMISD::VCEQ: Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break; case ARMISD::VCGE: Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break; case ARMISD::VCLEZ: Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break; case ARMISD::VCGT: Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break; case ARMISD::VCLTZ: Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break; default: Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); } } else { Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); } Result = DAG.getSExtOrTrunc(Result, dl, VT); if (Invert) Result = DAG.getNOT(dl, Result, VT); return Result; } /// isNEONModifiedImm - Check if the specified splat value corresponds to a /// valid vector constant for a NEON instruction with a "modified immediate" /// operand (e.g., VMOV). If so, return the encoded value. static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, SDLoc dl, EVT &VT, bool is128Bits, NEONModImmType type) { unsigned OpCmode, Imm; // SplatBitSize is set to the smallest size that splats the vector, so a // zero vector will always have SplatBitSize == 8. However, NEON modified // immediate instructions others than VMOV do not support the 8-bit encoding // of a zero vector, and the default encoding of zero is supposed to be the // 32-bit version. if (SplatBits == 0) SplatBitSize = 32; switch (SplatBitSize) { case 8: if (type != VMOVModImm) return SDValue(); // Any 1-byte value is OK. Op=0, Cmode=1110. assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); OpCmode = 0xe; Imm = SplatBits; VT = is128Bits ? MVT::v16i8 : MVT::v8i8; break; case 16: // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. VT = is128Bits ? MVT::v8i16 : MVT::v4i16; if ((SplatBits & ~0xff) == 0) { // Value = 0x00nn: Op=x, Cmode=100x. OpCmode = 0x8; Imm = SplatBits; break; } if ((SplatBits & ~0xff00) == 0) { // Value = 0xnn00: Op=x, Cmode=101x. OpCmode = 0xa; Imm = SplatBits >> 8; break; } return SDValue(); case 32: // NEON's 32-bit VMOV supports splat values where: // * only one byte is nonzero, or // * the least significant byte is 0xff and the second byte is nonzero, or // * the least significant 2 bytes are 0xff and the third is nonzero. VT = is128Bits ? MVT::v4i32 : MVT::v2i32; if ((SplatBits & ~0xff) == 0) { // Value = 0x000000nn: Op=x, Cmode=000x. OpCmode = 0; Imm = SplatBits; break; } if ((SplatBits & ~0xff00) == 0) { // Value = 0x0000nn00: Op=x, Cmode=001x. OpCmode = 0x2; Imm = SplatBits >> 8; break; } if ((SplatBits & ~0xff0000) == 0) { // Value = 0x00nn0000: Op=x, Cmode=010x. OpCmode = 0x4; Imm = SplatBits >> 16; break; } if ((SplatBits & ~0xff000000) == 0) { // Value = 0xnn000000: Op=x, Cmode=011x. OpCmode = 0x6; Imm = SplatBits >> 24; break; } // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC if (type == OtherModImm) return SDValue(); if ((SplatBits & ~0xffff) == 0 && ((SplatBits | SplatUndef) & 0xff) == 0xff) { // Value = 0x0000nnff: Op=x, Cmode=1100. OpCmode = 0xc; Imm = SplatBits >> 8; break; } if ((SplatBits & ~0xffffff) == 0 && ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { // Value = 0x00nnffff: Op=x, Cmode=1101. OpCmode = 0xd; Imm = SplatBits >> 16; break; } // Note: there are a few 32-bit splat values (specifically: 00ffff00, // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not // VMOV.I32. A (very) minor optimization would be to replicate the value // and fall through here to test for a valid 64-bit splat. But, then the // caller would also need to check and handle the change in size. return SDValue(); case 64: { if (type != VMOVModImm) return SDValue(); // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. uint64_t BitMask = 0xff; uint64_t Val = 0; unsigned ImmMask = 1; Imm = 0; for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { if (((SplatBits | SplatUndef) & BitMask) == BitMask) { Val |= BitMask; Imm |= ImmMask; } else if ((SplatBits & BitMask) != 0) { return SDValue(); } BitMask <<= 8; ImmMask <<= 1; } if (DAG.getDataLayout().isBigEndian()) // swap higher and lower 32 bit word Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); // Op=1, Cmode=1110. OpCmode = 0x1e; VT = is128Bits ? MVT::v2i64 : MVT::v1i64; break; } default: llvm_unreachable("unexpected size for isNEONModifiedImm"); } unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); } SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const { if (!ST->hasVFP3()) return SDValue(); bool IsDouble = Op.getValueType() == MVT::f64; ConstantFPSDNode *CFP = cast(Op); // Use the default (constant pool) lowering for double constants when we have // an SP-only FPU if (IsDouble && Subtarget->isFPOnlySP()) return SDValue(); // Try splatting with a VMOV.f32... APFloat FPVal = CFP->getValueAPF(); int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); if (ImmVal != -1) { if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { // We have code in place to select a valid ConstantFP already, no need to // do any mangling. return Op; } // It's a float and we are trying to use NEON operations where // possible. Lower it to a splat followed by an extract. SDLoc DL(Op); SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, NewVal); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, DAG.getConstant(0, DL, MVT::i32)); } // The rest of our options are NEON only, make sure that's allowed before // proceeding.. if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) return SDValue(); EVT VMovVT; uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); // It wouldn't really be worth bothering for doubles except for one very // important value, which does happen to match: 0.0. So make sure we don't do // anything stupid. if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) return SDValue(); // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, false, VMOVModImm); if (NewVal != SDValue()) { SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, NewVal); if (IsDouble) return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); // It's a float: cast and extract a vector element. SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, VecConstant); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, DAG.getConstant(0, DL, MVT::i32)); } // Finally, try a VMVN.i32 NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, false, VMVNModImm); if (NewVal != SDValue()) { SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); if (IsDouble) return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); // It's a float: cast and extract a vector element. SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, VecConstant); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, DAG.getConstant(0, DL, MVT::i32)); } return SDValue(); } // check if an VEXT instruction can handle the shuffle mask when the // vector sources of the shuffle are the same. static bool isSingletonVEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { unsigned NumElts = VT.getVectorNumElements(); // Assume that the first shuffle index is not UNDEF. Fail if it is. if (M[0] < 0) return false; Imm = M[0]; // If this is a VEXT shuffle, the immediate value is the index of the first // element. The other shuffle indices must be the successive elements after // the first one. unsigned ExpectedElt = Imm; for (unsigned i = 1; i < NumElts; ++i) { // Increment the expected index. If it wraps around, just follow it // back to index zero and keep going. ++ExpectedElt; if (ExpectedElt == NumElts) ExpectedElt = 0; if (M[i] < 0) continue; // ignore UNDEF indices if (ExpectedElt != static_cast(M[i])) return false; } return true; } static bool isVEXTMask(ArrayRef M, EVT VT, bool &ReverseVEXT, unsigned &Imm) { unsigned NumElts = VT.getVectorNumElements(); ReverseVEXT = false; // Assume that the first shuffle index is not UNDEF. Fail if it is. if (M[0] < 0) return false; Imm = M[0]; // If this is a VEXT shuffle, the immediate value is the index of the first // element. The other shuffle indices must be the successive elements after // the first one. unsigned ExpectedElt = Imm; for (unsigned i = 1; i < NumElts; ++i) { // Increment the expected index. If it wraps around, it may still be // a VEXT but the source vectors must be swapped. ExpectedElt += 1; if (ExpectedElt == NumElts * 2) { ExpectedElt = 0; ReverseVEXT = true; } if (M[i] < 0) continue; // ignore UNDEF indices if (ExpectedElt != static_cast(M[i])) return false; } // Adjust the index value if the source operands will be swapped. if (ReverseVEXT) Imm -= NumElts; return true; } /// isVREVMask - Check if a vector shuffle corresponds to a VREV /// instruction with the specified blocksize. (The order of the elements /// within each block of the vector is reversed.) static bool isVREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && "Only possible block sizes for VREV are: 16, 32, 64"); unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); unsigned BlockElts = M[0] + 1; // If the first shuffle index is UNDEF, be optimistic. if (M[0] < 0) BlockElts = BlockSize / EltSz; if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) return false; for (unsigned i = 0; i < NumElts; ++i) { if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) return false; } return true; } static bool isVTBLMask(ArrayRef M, EVT VT) { // We can handle <8 x i8> vector shuffles. If the index in the mask is out of // range, then 0 is placed into the resulting vector. So pretty much any mask // of 8 elements can work here. return VT == MVT::v8i8 && M.size() == 8; } // Checks whether the shuffle mask represents a vector transpose (VTRN) by // checking that pairs of elements in the shuffle mask represent the same index // in each vector, incrementing the expected index by 2 at each step. // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} // v2={e,f,g,h} // WhichResult gives the offset for each element in the mask based on which // of the two results it belongs to. // // The transpose can be represented either as: // result1 = shufflevector v1, v2, result1_shuffle_mask // result2 = shufflevector v1, v2, result2_shuffle_mask // where v1/v2 and the shuffle masks have the same number of elements // (here WhichResult (see below) indicates which result is being checked) // // or as: // results = shufflevector v1, v2, shuffle_mask // where both results are returned in one vector and the shuffle mask has twice // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we // want to check the low half and high half of the shuffle mask as if it were // the other case static bool isVTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); if (M.size() != NumElts && M.size() != NumElts*2) return false; // If the mask is twice as long as the input vector then we need to check the // upper and lower parts of the mask with a matching value for WhichResult // FIXME: A mask with only even values will be rejected in case the first // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only // M[0] is used to determine WhichResult for (unsigned i = 0; i < M.size(); i += NumElts) { if (M.size() == NumElts * 2) WhichResult = i / NumElts; else WhichResult = M[i] == 0 ? 0 : 1; for (unsigned j = 0; j < NumElts; j += 2) { if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) return false; } } if (M.size() == NumElts*2) WhichResult = 0; return true; } /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. static bool isVTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); if (M.size() != NumElts && M.size() != NumElts*2) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { if (M.size() == NumElts * 2) WhichResult = i / NumElts; else WhichResult = M[i] == 0 ? 0 : 1; for (unsigned j = 0; j < NumElts; j += 2) { if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) return false; } } if (M.size() == NumElts*2) WhichResult = 0; return true; } // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking // that the mask elements are either all even and in steps of size 2 or all odd // and in steps of size 2. // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} // v2={e,f,g,h} // Requires similar checks to that of isVTRNMask with // respect the how results are returned. static bool isVUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); if (M.size() != NumElts && M.size() != NumElts*2) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { WhichResult = M[i] == 0 ? 0 : 1; for (unsigned j = 0; j < NumElts; ++j) { if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) return false; } } if (M.size() == NumElts*2) WhichResult = 0; // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; return true; } /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, static bool isVUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); if (M.size() != NumElts && M.size() != NumElts*2) return false; unsigned Half = NumElts / 2; for (unsigned i = 0; i < M.size(); i += NumElts) { WhichResult = M[i] == 0 ? 0 : 1; for (unsigned j = 0; j < NumElts; j += Half) { unsigned Idx = WhichResult; for (unsigned k = 0; k < Half; ++k) { int MIdx = M[i + j + k]; if (MIdx >= 0 && (unsigned) MIdx != Idx) return false; Idx += 2; } } } if (M.size() == NumElts*2) WhichResult = 0; // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; return true; } // Checks whether the shuffle mask represents a vector zip (VZIP) by checking // that pairs of elements of the shufflemask represent the same index in each // vector incrementing sequentially through the vectors. // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} // v2={e,f,g,h} // Requires similar checks to that of isVTRNMask with respect the how results // are returned. static bool isVZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); if (M.size() != NumElts && M.size() != NumElts*2) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { WhichResult = M[i] == 0 ? 0 : 1; unsigned Idx = WhichResult * NumElts / 2; for (unsigned j = 0; j < NumElts; j += 2) { if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) return false; Idx += 1; } } if (M.size() == NumElts*2) WhichResult = 0; // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; return true; } /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. static bool isVZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); if (M.size() != NumElts && M.size() != NumElts*2) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { WhichResult = M[i] == 0 ? 0 : 1; unsigned Idx = WhichResult * NumElts / 2; for (unsigned j = 0; j < NumElts; j += 2) { if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) return false; Idx += 1; } } if (M.size() == NumElts*2) WhichResult = 0; // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; return true; } /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. static unsigned isNEONTwoResultShuffleMask(ArrayRef ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF) { isV_UNDEF = false; if (isVTRNMask(ShuffleMask, VT, WhichResult)) return ARMISD::VTRN; if (isVUZPMask(ShuffleMask, VT, WhichResult)) return ARMISD::VUZP; if (isVZIPMask(ShuffleMask, VT, WhichResult)) return ARMISD::VZIP; isV_UNDEF = true; if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) return ARMISD::VTRN; if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) return ARMISD::VUZP; if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) return ARMISD::VZIP; return 0; } /// \return true if this is a reverse operation on an vector. static bool isReverseMask(ArrayRef M, EVT VT) { unsigned NumElts = VT.getVectorNumElements(); // Make sure the mask has the right size. if (NumElts != M.size()) return false; // Look for <15, ..., 3, -1, 1, 0>. for (unsigned i = 0; i != NumElts; ++i) if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i)) return false; return true; } // If N is an integer constant that can be moved into a register in one // instruction, return an SDValue of such a constant (will become a MOV // instruction). Otherwise return null. static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, SDLoc dl) { uint64_t Val; if (!isa(N)) return SDValue(); Val = cast(N)->getZExtValue(); if (ST->isThumb1Only()) { if (Val <= 255 || ~Val <= 255) return DAG.getConstant(Val, dl, MVT::i32); } else { if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) return DAG.getConstant(Val, dl, MVT::i32); } return SDValue(); } // If this is a case we can't handle, return null and let the default // expansion code take care of it. SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const { BuildVectorSDNode *BVN = cast(Op.getNode()); SDLoc dl(Op); EVT VT = Op.getValueType(); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { // Check if an immediate VMOV works. EVT VmovVT; SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, VT.is128BitVector(), VMOVModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); } // Try an immediate VMVN. uint64_t NegatedImm = (~SplatBits).getZExtValue(); Val = isNEONModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, VT.is128BitVector(), VMVNModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); } // Use vmov.f32 to materialize other v2f32 and v4f32 splats. if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { int ImmVal = ARM_AM::getFP32Imm(SplatBits); if (ImmVal != -1) { SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); } } } } // Scan through the operands to see if only one value is used. // // As an optimisation, even if more than one value is used it may be more // profitable to splat with one value then change some lanes. // // Heuristically we decide to do this if the vector has a "dominant" value, // defined as splatted to more than half of the lanes. unsigned NumElts = VT.getVectorNumElements(); bool isOnlyLowElement = true; bool usesOnlyOneValue = true; bool hasDominantValue = false; bool isConstant = true; // Map of the number of times a particular SDValue appears in the // element list. DenseMap ValueCounts; SDValue Value; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) continue; if (i > 0) isOnlyLowElement = false; if (!isa(V) && !isa(V)) isConstant = false; ValueCounts.insert(std::make_pair(V, 0)); unsigned &Count = ValueCounts[V]; // Is this value dominant? (takes up more than half of the lanes) if (++Count > (NumElts / 2)) { hasDominantValue = true; Value = V; } } if (ValueCounts.size() != 1) usesOnlyOneValue = false; if (!Value.getNode() && ValueCounts.size() > 0) Value = ValueCounts.begin()->first; if (ValueCounts.size() == 0) return DAG.getUNDEF(VT); // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. // Keep going if we are hitting this case. if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); unsigned EltSize = VT.getVectorElementType().getSizeInBits(); // Use VDUP for non-constant splats. For f32 constant splats, reduce to // i32 and try again. if (hasDominantValue && EltSize <= 32) { if (!isConstant) { SDValue N; // If we are VDUPing a value that comes directly from a vector, that will // cause an unnecessary move to and from a GPR, where instead we could // just use VDUPLANE. We can only do this if the lane being extracted // is at a constant index, as the VDUP from lane instructions only have // constant-index forms. ConstantSDNode *constIndex; if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && (constIndex = dyn_cast(Value->getOperand(1)))) { // We need to create a new undef vector to use for the VDUPLANE if the // size of the vector from which we get the value is different than the // size of the vector that we need to create. We will insert the element // such that the register coalescer will remove unnecessary copies. if (VT != Value->getOperand(0).getValueType()) { unsigned index = constIndex->getAPIntValue().getLimitedValue() % VT.getVectorNumElements(); N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), Value, DAG.getConstant(index, dl, MVT::i32)), DAG.getConstant(index, dl, MVT::i32)); } else N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, Value->getOperand(0), Value->getOperand(1)); } else N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); if (!usesOnlyOneValue) { // The dominant value was splatted as 'N', but we now have to insert // all differing elements. for (unsigned I = 0; I < NumElts; ++I) { if (Op.getOperand(I) == Value) continue; SmallVector Ops; Ops.push_back(N); Ops.push_back(Op.getOperand(I)); Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); } } return N; } if (VT.getVectorElementType().isFloatingPoint()) { SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); Val = LowerBUILD_VECTOR(Val, DAG, ST); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); } if (usesOnlyOneValue) { SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); if (isConstant && Val.getNode()) return DAG.getNode(ARMISD::VDUP, dl, VT, Val); } } // If all elements are constants and the case above didn't get hit, fall back // to the default expansion, which will generate a load from the constant // pool. if (isConstant) return SDValue(); // Empirical tests suggest this is rarely worth it for vectors of length <= 2. if (NumElts >= 4) { SDValue shuffle = ReconstructShuffle(Op, DAG); if (shuffle != SDValue()) return shuffle; } // Vectors with 32- or 64-bit elements can be built by directly assigning // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands // will be legalized. if (EltSize >= 32) { // Do the expansion with floating-point types, since that is what the VFP // registers are defined to use, and since i64 is not legal. EVT EltVT = EVT::getFloatingPointVT(EltSize); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, Val); } // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we // know the default expansion would otherwise fall back on something even // worse. For a vector with one or two non-undef values, that's // scalar_to_vector for the elements followed by a shuffle (provided the // shuffle is valid for the target) and materialization element by element // on the stack followed by a load for everything else. if (!isConstant && !usesOnlyOneValue) { SDValue Vec = DAG.getUNDEF(VT); for (unsigned i = 0 ; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) continue; SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); } return Vec; } return SDValue(); } // Gather data to see if the operation can be modelled as a // shuffle in combination with VEXTs. SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); struct ShuffleSourceInfo { SDValue Vec; unsigned MinElt; unsigned MaxElt; // We may insert some combination of BITCASTs and VEXT nodes to force Vec to // be compatible with the shuffle we intend to construct. As a result // ShuffleVec will be some sliding window into the original Vec. SDValue ShuffleVec; // Code should guarantee that element i in Vec starts at element "WindowBase // + i * WindowScale in ShuffleVec". int WindowBase; int WindowScale; bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } ShuffleSourceInfo(SDValue Vec) : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} }; // First gather all vectors used as an immediate source for this BUILD_VECTOR // node. SmallVector Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) continue; else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { // A shuffle can only come from building a vector from various // elements of other vectors. return SDValue(); } else if (!isa(V.getOperand(1))) { // Furthermore, shuffles require a constant mask, whereas extractelts // accept variable indices. return SDValue(); } // Add this element source to the list if it's not already there. SDValue SourceVec = V.getOperand(0); auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); if (Source == Sources.end()) Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); // Update the minimum and maximum lane number seen. unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); Source->MinElt = std::min(Source->MinElt, EltNo); Source->MaxElt = std::max(Source->MaxElt, EltNo); } // Currently only do something sane when at most two source vectors // are involved. if (Sources.size() > 2) return SDValue(); // Find out the smallest element size among result and two sources, and use // it as element size to build the shuffle_vector. EVT SmallestEltTy = VT.getVectorElementType(); for (auto &Source : Sources) { EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); if (SrcEltTy.bitsLT(SmallestEltTy)) SmallestEltTy = SrcEltTy; } unsigned ResMultiplier = VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); // If the source vector is too wide or too narrow, we may nevertheless be able // to construct a compatible shuffle either by concatenating it with UNDEF or // extracting a suitable range of elements. for (auto &Src : Sources) { EVT SrcVT = Src.ShuffleVec.getValueType(); if (SrcVT.getSizeInBits() == VT.getSizeInBits()) continue; // This stage of the search produces a source with the same element type as // the original, but with a total width matching the BUILD_VECTOR output. EVT EltVT = SrcVT.getVectorElementType(); unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) return SDValue(); // We can pad out the smaller vector for free, so if it's part of a // shuffle... Src.ShuffleVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, DAG.getUNDEF(Src.ShuffleVec.getValueType())); continue; } if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) return SDValue(); if (Src.MaxElt - Src.MinElt >= NumSrcElts) { // Span too large for a VEXT to cope return SDValue(); } if (Src.MinElt >= NumSrcElts) { // The extraction can just take the second half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, dl, MVT::i32)); Src.WindowBase = -NumSrcElts; } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, dl, MVT::i32)); } else { // An actual VEXT is needed SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, dl, MVT::i32)); SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, dl, MVT::i32)); Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, VEXTSrc2, DAG.getConstant(Src.MinElt, dl, MVT::i32)); Src.WindowBase = -Src.MinElt; } } // Another possible incompatibility occurs from the vector element types. We // can fix this by bitcasting the source vectors to the same type we intend // for the shuffle. for (auto &Src : Sources) { EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); if (SrcEltTy == SmallestEltTy) continue; assert(ShuffleVT.getVectorElementType() == SmallestEltTy); Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); Src.WindowBase *= Src.WindowScale; } // Final sanity check before we try to actually produce a shuffle. DEBUG( for (auto Src : Sources) assert(Src.ShuffleVec.getValueType() == ShuffleVT); ); // The stars all align, our next step is to produce the mask for the shuffle. SmallVector Mask(ShuffleVT.getVectorNumElements(), -1); int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); if (Entry.getOpcode() == ISD::UNDEF) continue; auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); int EltNo = cast(Entry.getOperand(1))->getSExtValue(); // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit // trunc. So only std::min(SrcBits, DestBits) actually get defined in this // segment. EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); int BitsDefined = std::min(OrigEltTy.getSizeInBits(), VT.getVectorElementType().getSizeInBits()); int LanesDefined = BitsDefined / BitsPerShuffleLane; // This source is expected to fill ResMultiplier lanes of the final shuffle, // starting at the appropriate offset. int *LaneMask = &Mask[i * ResMultiplier]; int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; ExtractBase += NumElts * (Src - Sources.begin()); for (int j = 0; j < LanesDefined; ++j) LaneMask[j] = ExtractBase + j; } // Final check before we try to produce nonsense... if (!isShuffleMaskLegal(Mask, ShuffleVT)) return SDValue(); // We can't handle more than two sources. This should have already // been checked before this point. assert(Sources.size() <= 2 && "Too many sources!"); SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; for (unsigned i = 0; i < Sources.size(); ++i) ShuffleOps[i] = Sources[i].ShuffleVec; SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], ShuffleOps[1], &Mask[0]); return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } /// isShuffleMaskLegal - Targets can use this to indicate that they only /// support *some* VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values /// are assumed to be legal. bool ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const { if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (M[i] < 0) PFIndexes[i] = 8; else PFIndexes[i] = M[i]; } // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); if (Cost <= 4) return true; } bool ReverseVEXT, isV_UNDEF; unsigned Imm, WhichResult; unsigned EltSize = VT.getVectorElementType().getSizeInBits(); return (EltSize >= 32 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isVREVMask(M, VT, 64) || isVREVMask(M, VT, 32) || isVREVMask(M, VT, 16) || isVEXTMask(M, VT, ReverseVEXT, Imm) || isVTBLMask(M, VT) || isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) || ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit /// the specified operations to build the shuffle. static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, SDLoc dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); enum { OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> OP_VREV, OP_VDUP0, OP_VDUP1, OP_VDUP2, OP_VDUP3, OP_VEXT1, OP_VEXT2, OP_VEXT3, OP_VUZPL, // VUZP, left result OP_VUZPR, // VUZP, right result OP_VZIPL, // VZIP, left result OP_VZIPR, // VZIP, right result OP_VTRNL, // VTRN, left result OP_VTRNR // VTRN, right result }; if (OpNum == OP_COPY) { if (LHSID == (1*9+2)*9+3) return LHS; assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); return RHS; } SDValue OpLHS, OpRHS; OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); EVT VT = OpLHS.getValueType(); switch (OpNum) { default: llvm_unreachable("Unknown shuffle opcode!"); case OP_VREV: // VREV divides the vector in half and swaps within the half. if (VT.getVectorElementType() == MVT::i32 || VT.getVectorElementType() == MVT::f32) return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); // vrev <4 x i16> -> VREV32 if (VT.getVectorElementType() == MVT::i16) return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); // vrev <4 x i8> -> VREV16 assert(VT.getVectorElementType() == MVT::i8); return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); case OP_VDUP0: case OP_VDUP1: case OP_VDUP2: case OP_VDUP3: return DAG.getNode(ARMISD::VDUPLANE, dl, VT, OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); case OP_VEXT1: case OP_VEXT2: case OP_VEXT3: return DAG.getNode(ARMISD::VEXT, dl, VT, OpLHS, OpRHS, DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); case OP_VUZPL: case OP_VUZPR: return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); case OP_VZIPL: case OP_VZIPR: return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); case OP_VTRNL: case OP_VTRNR: return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); } } static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef ShuffleMask, SelectionDAG &DAG) { // Check to see if we can use the VTBL instruction. SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc DL(Op); SmallVector VTBLMask; for (ArrayRef::iterator I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); if (V2.getNode()->getOpcode() == ISD::UNDEF) return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask)); return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask)); } static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); SDValue OpLHS = Op.getOperand(0); EVT VT = OpLHS.getValueType(); assert((VT == MVT::v8i16 || VT == MVT::v16i8) && "Expect an v8i16/v16i8 type"); OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now, // extract the first 8 bytes into the top double word and the last 8 bytes // into the bottom double word. The v8i16 case is similar. unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4; return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, DAG.getConstant(ExtractNum, DL, MVT::i32)); } static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc dl(Op); EVT VT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast(Op.getNode()); // Convert shuffles that are directly supported on NEON to target-specific // DAG nodes, instead of keeping them as shuffles and matching them again // during code selection. This is more efficient and avoids the possibility // of inconsistencies between legalization and selection. // FIXME: floating-point vectors should be canonicalized to integer vectors // of the same time so that they get CSEd properly. ArrayRef ShuffleMask = SVN->getMask(); unsigned EltSize = VT.getVectorElementType().getSizeInBits(); if (EltSize <= 32) { if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { int Lane = SVN->getSplatIndex(); // If this is undef splat, generate it via "just" vdup, if possible. if (Lane == -1) Lane = 0; // Test if V1 is a SCALAR_TO_VECTOR. if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); } // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR // (and probably will turn into a SCALAR_TO_VECTOR once legalization // reaches it). if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && !isa(V1.getOperand(0))) { bool IsScalarToVector = true; for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) if (V1.getOperand(i).getOpcode() != ISD::UNDEF) { IsScalarToVector = false; break; } if (IsScalarToVector) return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); } return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i32)); } bool ReverseVEXT; unsigned Imm; if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { if (ReverseVEXT) std::swap(V1, V2); return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, DAG.getConstant(Imm, dl, MVT::i32)); } if (isVREVMask(ShuffleMask, VT, 64)) return DAG.getNode(ARMISD::VREV64, dl, VT, V1); if (isVREVMask(ShuffleMask, VT, 32)) return DAG.getNode(ARMISD::VREV32, dl, VT, V1); if (isVREVMask(ShuffleMask, VT, 16)) return DAG.getNode(ARMISD::VREV16, dl, VT, V1); if (V2->getOpcode() == ISD::UNDEF && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); } // Check for Neon shuffles that modify both input vectors in place. // If both results are used, i.e., if there are two shuffles with the same // source operands and with masks corresponding to both results of one of // these operations, DAG memoization will ensure that a single node is // used for both shuffles. unsigned WhichResult; bool isV_UNDEF; if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( ShuffleMask, VT, WhichResult, isV_UNDEF)) { if (isV_UNDEF) V2 = V1; return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) .getValue(WhichResult); } // Also check for these shuffles through CONCAT_VECTORS: we canonicalize // shuffles that produce a result larger than their operands with: // shuffle(concat(v1, undef), concat(v2, undef)) // -> // shuffle(concat(v1, v2), undef) // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). // // This is useful in the general case, but there are special cases where // native shuffles produce larger results: the two-result ops. // // Look through the concat when lowering them: // shuffle(concat(v1, v2), undef) // -> // concat(VZIP(v1, v2):0, :1) // if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->getOpcode() == ISD::UNDEF) { SDValue SubV1 = V1->getOperand(0); SDValue SubV2 = V1->getOperand(1); EVT SubVT = SubV1.getValueType(); // We expect these to have been canonicalized to -1. assert(std::all_of(ShuffleMask.begin(), ShuffleMask.end(), [&](int i) { return i < (int)VT.getVectorNumElements(); }) && "Unexpected shuffle index into UNDEF operand!"); if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { if (isV_UNDEF) SubV2 = SubV1; assert((WhichResult == 0) && "In-place shuffle of concat can only have one result!"); SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), SubV1, SubV2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), Res.getValue(1)); } } } // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. unsigned NumElts = VT.getVectorNumElements(); if (NumElts == 4) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (ShuffleMask[i] < 0) PFIndexes[i] = 8; else PFIndexes[i] = ShuffleMask[i]; } // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); if (Cost <= 4) return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. if (EltSize >= 32) { // Do the expansion with floating-point types, since that is what the VFP // registers are defined to use, and since i64 is not legal. EVT EltVT = EVT::getFloatingPointVT(EltSize); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) { if (ShuffleMask[i] < 0) Ops.push_back(DAG.getUNDEF(EltVT)); else Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ShuffleMask[i] < (int)NumElts ? V1 : V2, DAG.getConstant(ShuffleMask[i] & (NumElts-1), dl, MVT::i32))); } SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, Val); } if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); if (VT == MVT::v8i8) { SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG); if (NewOp.getNode()) return NewOp; } return SDValue(); } static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { // INSERT_VECTOR_ELT is legal only for immediate indexes. SDValue Lane = Op.getOperand(2); if (!isa(Lane)) return SDValue(); return Op; } static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { // EXTRACT_VECTOR_ELT is legal only for immediate indexes. SDValue Lane = Op.getOperand(1); if (!isa(Lane)) return SDValue(); SDValue Vec = Op.getOperand(0); if (Op.getValueType() == MVT::i32 && Vec.getValueType().getVectorElementType().getSizeInBits() < 32) { SDLoc dl(Op); return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); } return Op; } static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { // The only time a CONCAT_VECTORS operation can have legal types is when // two 64-bit vectors are concatenated to a 128-bit vector. assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && "unexpected CONCAT_VECTORS"); SDLoc dl(Op); SDValue Val = DAG.getUNDEF(MVT::v2f64); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); if (Op0.getOpcode() != ISD::UNDEF) Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), DAG.getIntPtrConstant(0, dl)); if (Op1.getOpcode() != ISD::UNDEF) Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), DAG.getIntPtrConstant(1, dl)); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); } /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each /// element has been zero/sign-extended, depending on the isSigned parameter, /// from an integer type half its size. static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, bool isSigned) { // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. EVT VT = N->getValueType(0); if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { SDNode *BVN = N->getOperand(0).getNode(); if (BVN->getValueType(0) != MVT::v4i32 || BVN->getOpcode() != ISD::BUILD_VECTOR) return false; unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; unsigned HiElt = 1 - LoElt; ConstantSDNode *Lo0 = dyn_cast(BVN->getOperand(LoElt)); ConstantSDNode *Hi0 = dyn_cast(BVN->getOperand(HiElt)); ConstantSDNode *Lo1 = dyn_cast(BVN->getOperand(LoElt+2)); ConstantSDNode *Hi1 = dyn_cast(BVN->getOperand(HiElt+2)); if (!Lo0 || !Hi0 || !Lo1 || !Hi1) return false; if (isSigned) { if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) return true; } else { if (Hi0->isNullValue() && Hi1->isNullValue()) return true; } return false; } if (N->getOpcode() != ISD::BUILD_VECTOR) return false; for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { SDNode *Elt = N->getOperand(i).getNode(); if (ConstantSDNode *C = dyn_cast(Elt)) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); unsigned HalfSize = EltSize / 2; if (isSigned) { if (!isIntN(HalfSize, C->getSExtValue())) return false; } else { if (!isUIntN(HalfSize, C->getZExtValue())) return false; } continue; } return false; } return true; } /// isSignExtended - Check if a node is a vector value that is sign-extended /// or a constant BUILD_VECTOR with sign-extended elements. static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) return true; if (isExtendedBUILD_VECTOR(N, DAG, true)) return true; return false; } /// isZeroExtended - Check if a node is a vector value that is zero-extended /// or a constant BUILD_VECTOR with zero-extended elements. static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) return true; if (isExtendedBUILD_VECTOR(N, DAG, false)) return true; return false; } static EVT getExtensionTo64Bits(const EVT &OrigVT) { if (OrigVT.getSizeInBits() >= 64) return OrigVT; assert(OrigVT.isSimple() && "Expecting a simple value type"); MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; switch (OrigSimpleTy) { default: llvm_unreachable("Unexpected Vector Type"); case MVT::v2i8: case MVT::v2i16: return MVT::v2i32; case MVT::v4i8: return MVT::v4i16; } } /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. /// We insert the required extension here to get the vector to fill a D register. static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode) { // The vector originally had a size of OrigTy. It was then extended to ExtTy. // We expect the ExtTy to be 128-bits total. If the OrigTy is less than // 64-bits we need to insert a new extension so that it will be 64-bits. assert(ExtTy.is128BitVector() && "Unexpected extension size"); if (OrigTy.getSizeInBits() >= 64) return N; // Must extend size to at least 64 bits to be used as an operand for VMULL. EVT NewVT = getExtensionTo64Bits(OrigTy); return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); } /// SkipLoadExtensionForVMULL - return a load of the original vector size that /// does not do any sign/zero extension. If the original vector is less /// than 64 bits, an appropriate extension will be added after the load to /// reach a total size of 64 bits. We have to add the extension separately /// because ARM does not have a sign/zero extending load for vectors. static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); // The load already has the right type. if (ExtendedTy == LD->getMemoryVT()) return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), LD->getAlignment()); // We need to create a zextload/sextload. We cannot just create a load // followed by a zext/zext node because LowerMUL is also run during normal // operation legalization where we can't create illegal types. return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), LD->getMemoryVT(), LD->isVolatile(), LD->isInvariant(), LD->isNonTemporal(), LD->getAlignment()); } /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, /// extending load, or BUILD_VECTOR with extended elements, return the /// unextended value. The unextended vector should be 64 bits so that it can /// be used as an operand to a VMULL instruction. If the original vector size /// before extension is less than 64 bits we add a an extension to resize /// the vector to 64 bits. static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, N->getOperand(0)->getValueType(0), N->getValueType(0), N->getOpcode()); if (LoadSDNode *LD = dyn_cast(N)) return SkipLoadExtensionForVMULL(LD, DAG); // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will // have been legalized as a BITCAST from v4i32. if (N->getOpcode() == ISD::BITCAST) { SDNode *BVN = N->getOperand(0).getNode(); assert(BVN->getOpcode() == ISD::BUILD_VECTOR && BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32, BVN->getOperand(LowElt), BVN->getOperand(LowElt+2)); } // Construct a new BUILD_VECTOR with elements truncated to half the size. assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); EVT VT = N->getValueType(0); unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; unsigned NumElts = VT.getVectorNumElements(); MVT TruncVT = MVT::getIntegerVT(EltSize); SmallVector Ops; SDLoc dl(N); for (unsigned i = 0; i != NumElts; ++i) { ConstantSDNode *C = cast(N->getOperand(i)); const APInt &CInt = C->getAPIntValue(); // Element types smaller than 32 bits are not legal, so use i32 elements. // The values are implicitly truncated so sext vs. zext doesn't matter. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); } return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::getVectorVT(TruncVT, NumElts), Ops); } static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isSignExtended(N0, DAG) && isSignExtended(N1, DAG); } return false; } static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); } return false; } static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { // Multiplications are only custom-lowered for 128-bit vectors so that // VMULL can be detected. Otherwise v2i64 multiplications are not legal. EVT VT = Op.getValueType(); assert(VT.is128BitVector() && VT.isInteger() && "unexpected type for custom-lowering ISD::MUL"); SDNode *N0 = Op.getOperand(0).getNode(); SDNode *N1 = Op.getOperand(1).getNode(); unsigned NewOpc = 0; bool isMLA = false; bool isN0SExt = isSignExtended(N0, DAG); bool isN1SExt = isSignExtended(N1, DAG); if (isN0SExt && isN1SExt) NewOpc = ARMISD::VMULLs; else { bool isN0ZExt = isZeroExtended(N0, DAG); bool isN1ZExt = isZeroExtended(N1, DAG); if (isN0ZExt && isN1ZExt) NewOpc = ARMISD::VMULLu; else if (isN1SExt || isN1ZExt) { // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these // into (s/zext A * s/zext C) + (s/zext B * s/zext C) if (isN1SExt && isAddSubSExt(N0, DAG)) { NewOpc = ARMISD::VMULLs; isMLA = true; } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { NewOpc = ARMISD::VMULLu; isMLA = true; } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { std::swap(N0, N1); NewOpc = ARMISD::VMULLu; isMLA = true; } } if (!NewOpc) { if (VT == MVT::v2i64) // Fall through to expand this. It is not legal. return SDValue(); else // Other vector multiplications are legal. return Op; } } // Legalize to a VMULL instruction. SDLoc DL(Op); SDValue Op0; SDValue Op1 = SkipExtensionForVMULL(N1, DAG); if (!isMLA) { Op0 = SkipExtensionForVMULL(N0, DAG); assert(Op0.getValueType().is64BitVector() && Op1.getValueType().is64BitVector() && "unexpected types for extended operands to VMULL"); return DAG.getNode(NewOpc, DL, VT, Op0, Op1); } // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during // isel lowering to take advantage of no-stall back to back vmul + vmla. // vmull q0, d4, d6 // vmlal q0, d5, d6 // is faster than // vaddl q0, d4, d5 // vmovl q1, d6 // vmul q0, q0, q1 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); EVT Op1VT = Op1.getValueType(); return DAG.getNode(N0->getOpcode(), DL, VT, DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { // TODO: Should this propagate fast-math-flags? // Convert to float // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); // Get reciprocal estimate. // float4 recip = vrecpeq_f32(yf); Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), Y); // Because char has a smaller range than uchar, we can actually get away // without any newton steps. This requires that we use a weird bias // of 0xb000, however (again, this has been exhaustively tested). // float4 result = as_float4(as_int4(xf*recip) + 0xb000); X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); Y = DAG.getConstant(0xb000, dl, MVT::i32); Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y); X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); // Convert back to short. X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); return X; } static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) { // TODO: Should this propagate fast-math-flags? SDValue N2; // Convert to float. // float4 yf = vcvt_f32_s32(vmovl_s16(y)); // float4 xf = vcvt_f32_s32(vmovl_s16(x)); N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); // Use reciprocal estimate and one refinement step. // float4 recip = vrecpeq_f32(yf); // recip *= vrecpsq_f32(yf, recip); N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), N1); N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), N1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); // Because short has a smaller range than ushort, we can actually get away // with only a single newton step. This requires that we use a weird bias // of 89, however (again, this has been exhaustively tested). // float4 result = as_float4(as_int4(xf*recip) + 0x89); N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); N1 = DAG.getConstant(0x89, dl, MVT::i32); N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); // Convert back to integer and return. // return vmovn_s32(vcvt_s32_f32(result)); N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); return N0; } static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::SDIV"); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2, N3; if (VT == MVT::v8i8) { N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(4, dl)); N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, DAG.getIntPtrConstant(4, dl)); N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(0, dl)); N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, DAG.getIntPtrConstant(0, dl)); N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); N0 = LowerCONCAT_VECTORS(N0, DAG); N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); return N0; } return LowerSDIV_v4i16(N0, N1, dl, DAG); } static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { // TODO: Should this propagate fast-math-flags? EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::UDIV"); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2, N3; if (VT == MVT::v8i8) { N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(4, dl)); N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, DAG.getIntPtrConstant(4, dl)); N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(0, dl)); N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, DAG.getIntPtrConstant(0, dl)); N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); N0 = LowerCONCAT_VECTORS(N0, DAG); N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, MVT::i32), N0); return N0; } // v4i16 sdiv ... Convert to float. // float4 yf = vcvt_f32_s32(vmovl_u16(y)); // float4 xf = vcvt_f32_s32(vmovl_u16(x)); N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); // Use reciprocal estimate and two refinement steps. // float4 recip = vrecpeq_f32(yf); // recip *= vrecpsq_f32(yf, recip); // recip *= vrecpsq_f32(yf, recip); N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), BN1); N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), BN1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), BN1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); // Simply multiplying by the reciprocal estimate can leave us a few ulps // too low, so we add 2 ulps (exhaustive testing shows that this is enough, // and that it will never cause us to return an answer too large). // float4 result = as_float4(as_int4(xf*recip) + 2); N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); N1 = DAG.getConstant(2, dl, MVT::i32); N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); // Convert back to integer and return. // return vmovn_u32(vcvt_s32_f32(result)); N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); return N0; } static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getNode()->getValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); unsigned Opc; bool ExtraOp = false; switch (Op.getOpcode()) { default: llvm_unreachable("Invalid code"); case ISD::ADDC: Opc = ARMISD::ADDC; break; case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break; case ISD::SUBC: Opc = ARMISD::SUBC; break; case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break; } if (!ExtraOp) return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); } SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin()); // For iOS, we want to call an alternative entry point: __sincos_stret, // return values are passed via sret. SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); auto PtrVT = getPointerTy(DAG.getDataLayout()); MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Pair of floats / doubles used to pass the result. Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr); auto &DL = DAG.getDataLayout(); ArgListTy Args; bool ShouldUseSRet = Subtarget->isAPCS_ABI(); SDValue SRet; if (ShouldUseSRet) { // Create stack object for sret. const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); ArgListEntry Entry; Entry.Node = SRet; Entry.Ty = RetTy->getPointerTo(); Entry.isSExt = false; Entry.isZExt = false; Entry.isSRet = true; Args.push_back(Entry); RetTy = Type::getVoidTy(*DAG.getContext()); } ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.isSExt = false; Entry.isZExt = false; Args.push_back(Entry); const char *LibcallName = (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; RTLIB::Libcall LC = (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32; CallingConv::ID CC = getLibcallCallingConv(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) .setCallee(CC, RetTy, Callee, std::move(Args), 0) .setDiscardResult(ShouldUseSRet); std::pair CallResult = LowerCallTo(CLI); if (!ShouldUseSRet) return CallResult.first; SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo(), false, false, false, 0); // Address of cos field. SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo(), false, false, false, 0); SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0), LoadCos.getValue(0)); } SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, SDValue &Chain) const { EVT VT = Op.getValueType(); assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected type for custom lowering DIV"); SDLoc dl(Op); const auto &DL = DAG.getDataLayout(); const auto &TLI = DAG.getTargetLoweringInfo(); const char *Name = nullptr; if (Signed) Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; else Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); ARMTargetLowering::ArgListTy Args; for (auto AI : {1, 0}) { ArgListEntry Arg; Arg.Node = Op.getOperand(AI); Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); Args.push_back(Arg); } CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(Chain) .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), ES, std::move(Args), 0); return LowerCallTo(CLI).first; } SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const { assert(Op.getValueType() == MVT::i32 && "unexpected type for custom lowering DIV"); SDLoc dl(Op); SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, DAG.getEntryNode(), Op.getOperand(1)); return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); } void ARMTargetLowering::ExpandDIV_Windows( SDValue Op, SelectionDAG &DAG, bool Signed, SmallVectorImpl &Results) const { const auto &DL = DAG.getDataLayout(); const auto &TLI = DAG.getTargetLoweringInfo(); assert(Op.getValueType() == MVT::i64 && "unexpected type for custom lowering DIV"); SDLoc dl(Op); SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), DAG.getConstant(1, dl, MVT::i32)); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, Lo, Hi); SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, DAG.getEntryNode(), Or); SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, DAG.getConstant(32, dl, TLI.getPointerTy(DL))); Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); Results.push_back(Lower); Results.push_back(Upper); } static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { // Monotonic load/store is legal for all targets if (cast(Op)->getOrdering() <= Monotonic) return Op; // Acquire/Release load/store is not legal for targets without a // dmb or equivalent available. return SDValue(); } static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { SDLoc DL(N); // Under Power Management extensions, the cycle-count is: // mrc p15, #0, , c9, c13, #0 SDValue Ops[] = { N->getOperand(0), // Chain DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), DAG.getConstant(15, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(9, DL, MVT::i32), DAG.getConstant(13, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i32) }; SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, DAG.getVTList(MVT::i32, MVT::Other), Ops); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, DAG.getConstant(0, DL, MVT::i32))); Results.push_back(Cycles32.getValue(1)); } SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalAddress: switch (Subtarget->getTargetTriple().getObjectFormat()) { default: llvm_unreachable("unknown object format"); case Triple::COFF: return LowerGlobalAddressWindows(Op, DAG); case Triple::ELF: return LowerGlobalAddressELF(Op, DAG); case Triple::MachO: return LowerGlobalAddressDarwin(Op, DAG); } case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); case ISD::SHL: case ISD::SRL: case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); case ISD::SREM: return LowerREM(Op.getNode(), DAG); case ISD::UREM: return LowerREM(Op.getNode(), DAG); case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); case ISD::SETCC: return LowerVSETCC(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SDIV: return LowerSDIV(Op, DAG); case ISD::UDIV: return LowerUDIV(Op, DAG); case ISD::ADDC: case ISD::ADDE: case ISD::SUBC: case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: return LowerXALUO(Op, DAG); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); case ISD::DYNAMIC_STACKALLOC: if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) return LowerDYNAMIC_STACKALLOC(Op, DAG); llvm_unreachable("Don't know how to custom lower this!"); case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); } } /// ReplaceNodeResults - Replace the results of node with an illegal result /// type with new values built out of custom code. void ARMTargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { SDValue Res; switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom expand this!"); case ISD::READ_REGISTER: ExpandREAD_REGISTER(N, Results, DAG); break; case ISD::BITCAST: Res = ExpandBITCAST(N, DAG); break; case ISD::SRL: case ISD::SRA: Res = Expand64BitShift(N, DAG, Subtarget); break; case ISD::SREM: case ISD::UREM: Res = LowerREM(N, DAG); break; case ISD::READCYCLECOUNTER: ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); return; case ISD::UDIV: case ISD::SDIV: assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, Results); } if (Res.getNode()) Results.push_back(Res); } //===----------------------------------------------------------------------===// // ARM Scheduler Hooks //===----------------------------------------------------------------------===// /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and /// registers the function context. void ARMTargetLowering:: SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); MachineConstantPool *MCP = MF->getConstantPool(); ARMFunctionInfo *AFI = MF->getInfo(); const Function *F = MF->getFunction(); bool isThumb = Subtarget->isThumb(); bool isThumb2 = Subtarget->isThumb2(); unsigned PCLabelId = AFI->createPICLabelUId(); unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; ARMConstantPoolValue *CPV = ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj); unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), MachineMemOperand::MOLoad, 4, 4); MachineMemOperand *FIMMOSt = MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4, 4); // Load the address of the dispatch MBB into the jump buffer. if (isThumb2) { // Incoming value: jbuf // ldr.n r5, LCPI1_1 // orr r5, r5, #1 // add r5, pc // str r5, [$jbuf, #+4] ; &jbuf[1] unsigned NewVReg1 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) .addConstantPoolIndex(CPI) .addMemOperand(CPMMO)); // Set the low bit because of thumb mode. unsigned NewVReg2 = MRI->createVirtualRegister(TRC); AddDefaultCC( AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(0x01))); unsigned NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) .addReg(NewVReg2, RegState::Kill) .addImm(PCLabelId); AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) .addReg(NewVReg3, RegState::Kill) .addFrameIndex(FI) .addImm(36) // &jbuf[1] :: pc .addMemOperand(FIMMOSt)); } else if (isThumb) { // Incoming value: jbuf // ldr.n r1, LCPI1_4 // add r1, pc // mov r2, #1 // orrs r1, r2 // add r2, $jbuf, #+4 ; &jbuf[1] // str r1, [r2] unsigned NewVReg1 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) .addConstantPoolIndex(CPI) .addMemOperand(CPMMO)); unsigned NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(PCLabelId); // Set the low bit because of thumb mode. unsigned NewVReg3 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) .addReg(ARM::CPSR, RegState::Define) .addImm(1)); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg2, RegState::Kill) .addReg(NewVReg3, RegState::Kill)); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) .addFrameIndex(FI) .addImm(36); // &jbuf[1] :: pc AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) .addReg(NewVReg4, RegState::Kill) .addReg(NewVReg5, RegState::Kill) .addImm(0) .addMemOperand(FIMMOSt)); } else { // Incoming value: jbuf // ldr r1, LCPI1_1 // add r1, pc, r1 // str r1, [$jbuf, #+4] ; &jbuf[1] unsigned NewVReg1 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) .addConstantPoolIndex(CPI) .addImm(0) .addMemOperand(CPMMO)); unsigned NewVReg2 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(PCLabelId)); AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) .addReg(NewVReg2, RegState::Kill) .addFrameIndex(FI) .addImm(36) // &jbuf[1] :: pc .addMemOperand(FIMMOSt)); } } void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); MachineFrameInfo *MFI = MF->getFrameInfo(); int FI = MFI->getFunctionContextIndex(); const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass : &ARM::GPRnopcRegClass; // Get a mapping of the call site numbers to all of the landing pads they're // associated with. DenseMap > CallSiteNumToLPad; unsigned MaxCSNum = 0; MachineModuleInfo &MMI = MF->getMMI(); for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; ++BB) { if (!BB->isEHPad()) continue; // FIXME: We should assert that the EH_LABEL is the first MI in the landing // pad. for (MachineBasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE; ++II) { if (!II->isEHLabel()) continue; MCSymbol *Sym = II->getOperand(0).getMCSymbol(); if (!MMI.hasCallSiteLandingPad(Sym)) continue; SmallVectorImpl &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym); for (SmallVectorImpl::iterator CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); CSI != CSE; ++CSI) { CallSiteNumToLPad[*CSI].push_back(&*BB); MaxCSNum = std::max(MaxCSNum, *CSI); } break; } } // Get an ordered list of the machine basic blocks for the jump table. std::vector LPadList; SmallPtrSet InvokeBBs; LPadList.reserve(CallSiteNumToLPad.size()); for (unsigned I = 1; I <= MaxCSNum; ++I) { SmallVectorImpl &MBBList = CallSiteNumToLPad[I]; for (SmallVectorImpl::iterator II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { LPadList.push_back(*II); InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); } } assert(!LPadList.empty() && "No landing pad destinations for the dispatch jump table!"); // Create the jump table and associated information. MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); unsigned MJTI = JTI->createJumpTableIndex(LPadList); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); // Create the MBBs for the dispatch code. // Shove the dispatch's address into the return slot in the function context. MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); DispatchBB->setIsEHPad(); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); unsigned trap_opcode; if (Subtarget->isThumb()) trap_opcode = ARM::tTRAP; else trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; BuildMI(TrapBB, dl, TII->get(trap_opcode)); DispatchBB->addSuccessor(TrapBB); MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); DispatchBB->addSuccessor(DispContBB); // Insert and MBBs. MF->insert(MF->end(), DispatchBB); MF->insert(MF->end(), DispContBB); MF->insert(MF->end(), TrapBB); // Insert code into the entry block that creates and registers the function // context. SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); MachineInstrBuilder MIB; MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); const ARMBaseInstrInfo *AII = static_cast(TII); const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); // Add a register mask with no preserved registers. This results in all // registers being marked as clobbered. MIB.addRegMask(RI.getNoPreservedMask()); unsigned NumLPads = LPadList.size(); if (Subtarget->isThumb2()) { unsigned NewVReg1 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) .addFrameIndex(FI) .addImm(4) .addMemOperand(FIMMOLd)); if (NumLPads < 256) { AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) .addReg(NewVReg1) .addImm(LPadList.size())); } else { unsigned VReg1 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) .addImm(NumLPads & 0xFFFF)); unsigned VReg2 = VReg1; if ((NumLPads & 0xFFFF0000) != 0) { VReg2 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) .addReg(VReg1) .addImm(NumLPads >> 16)); } AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) .addReg(NewVReg1) .addReg(VReg2)); } BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) .addMBB(TrapBB) .addImm(ARMCC::HI) .addReg(ARM::CPSR); unsigned NewVReg3 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3) .addJumpTableIndex(MJTI)); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); AddDefaultCC( AddDefaultPred( BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) .addReg(NewVReg3, RegState::Kill) .addReg(NewVReg1) .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) .addReg(NewVReg4, RegState::Kill) .addReg(NewVReg1) .addJumpTableIndex(MJTI); } else if (Subtarget->isThumb()) { unsigned NewVReg1 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) .addFrameIndex(FI) .addImm(1) .addMemOperand(FIMMOLd)); if (NumLPads < 256) { AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) .addReg(NewVReg1) .addImm(NumLPads)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) .addReg(VReg1, RegState::Define) .addConstantPoolIndex(Idx)); AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) .addReg(NewVReg1) .addReg(VReg1)); } BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) .addMBB(TrapBB) .addImm(ARMCC::HI) .addReg(ARM::CPSR); unsigned NewVReg2 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg1) .addImm(2)); unsigned NewVReg3 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) .addJumpTableIndex(MJTI)); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg2, RegState::Kill) .addReg(NewVReg3)); MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) .addReg(NewVReg4, RegState::Kill) .addImm(0) .addMemOperand(JTMMOLd)); unsigned NewVReg6 = NewVReg5; if (RelocM == Reloc::PIC_) { NewVReg6 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg5, RegState::Kill) .addReg(NewVReg3)); } BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) .addReg(NewVReg6, RegState::Kill) .addJumpTableIndex(MJTI); } else { unsigned NewVReg1 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) .addFrameIndex(FI) .addImm(4) .addMemOperand(FIMMOLd)); if (NumLPads < 256) { AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) .addReg(NewVReg1) .addImm(NumLPads)); } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { unsigned VReg1 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) .addImm(NumLPads & 0xFFFF)); unsigned VReg2 = VReg1; if ((NumLPads & 0xFFFF0000) != 0) { VReg2 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) .addReg(VReg1) .addImm(NumLPads >> 16)); } AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) .addReg(NewVReg1) .addReg(VReg2)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) .addReg(VReg1, RegState::Define) .addConstantPoolIndex(Idx) .addImm(0)); AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) .addReg(NewVReg1) .addReg(VReg1, RegState::Kill)); } BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) .addMBB(TrapBB) .addImm(ARMCC::HI) .addReg(ARM::CPSR); unsigned NewVReg3 = MRI->createVirtualRegister(TRC); AddDefaultCC( AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) .addReg(NewVReg1) .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) .addJumpTableIndex(MJTI)); MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); AddDefaultPred( BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) .addReg(NewVReg3, RegState::Kill) .addReg(NewVReg4) .addImm(0) .addMemOperand(JTMMOLd)); if (RelocM == Reloc::PIC_) { BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) .addReg(NewVReg5, RegState::Kill) .addReg(NewVReg4) .addJumpTableIndex(MJTI); } else { BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) .addReg(NewVReg5, RegState::Kill) .addJumpTableIndex(MJTI); } } // Add the jump table entries as successors to the MBB. SmallPtrSet SeenMBBs; for (std::vector::iterator I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { MachineBasicBlock *CurMBB = *I; if (SeenMBBs.insert(CurMBB).second) DispContBB->addSuccessor(CurMBB); } // N.B. the order the invoke BBs are processed in doesn't matter here. const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); SmallVector MBBLPads; for (MachineBasicBlock *BB : InvokeBBs) { // Remove the landing pad successor from the invoke block and replace it // with the new dispatch block. SmallVector Successors(BB->succ_begin(), BB->succ_end()); while (!Successors.empty()) { MachineBasicBlock *SMBB = Successors.pop_back_val(); if (SMBB->isEHPad()) { BB->removeSuccessor(SMBB); MBBLPads.push_back(SMBB); } } BB->addSuccessor(DispatchBB, BranchProbability::getZero()); BB->normalizeSuccProbs(); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from // moving instructions to before the EH block, where they will never be // executed. for (MachineBasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { if (!II->isCall()) continue; DenseMap DefRegs; for (MachineInstr::mop_iterator OI = II->operands_begin(), OE = II->operands_end(); OI != OE; ++OI) { if (!OI->isReg()) continue; DefRegs[OI->getReg()] = true; } MachineInstrBuilder MIB(*MF, &*II); for (unsigned i = 0; SavedRegs[i] != 0; ++i) { unsigned Reg = SavedRegs[i]; if (Subtarget->isThumb2() && !ARM::tGPRRegClass.contains(Reg) && !ARM::hGPRRegClass.contains(Reg)) continue; if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) continue; if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) continue; if (!DefRegs[Reg]) MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); } break; } } // Mark all former landing pads as non-landing pads. The dispatch is the only // landing pad now. for (SmallVectorImpl::iterator I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) (*I)->setIsEHPad(false); // The instruction is gone now. MI->eraseFromParent(); } static MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) if (*I != Succ) return *I; llvm_unreachable("Expecting a BB with two successors!"); } /// Return the load opcode for a given load size. If load size >= 8, /// neon opcode will be returned. static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { if (LdSize >= 8) return LdSize == 16 ? ARM::VLD1q32wb_fixed : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; if (IsThumb1) return LdSize == 4 ? ARM::tLDRi : LdSize == 2 ? ARM::tLDRHi : LdSize == 1 ? ARM::tLDRBi : 0; if (IsThumb2) return LdSize == 4 ? ARM::t2LDR_POST : LdSize == 2 ? ARM::t2LDRH_POST : LdSize == 1 ? ARM::t2LDRB_POST : 0; return LdSize == 4 ? ARM::LDR_POST_IMM : LdSize == 2 ? ARM::LDRH_POST : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; } /// Return the store opcode for a given store size. If store size >= 8, /// neon opcode will be returned. static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { if (StSize >= 8) return StSize == 16 ? ARM::VST1q32wb_fixed : StSize == 8 ? ARM::VST1d32wb_fixed : 0; if (IsThumb1) return StSize == 4 ? ARM::tSTRi : StSize == 2 ? ARM::tSTRHi : StSize == 1 ? ARM::tSTRBi : 0; if (IsThumb2) return StSize == 4 ? ARM::t2STR_POST : StSize == 2 ? ARM::t2STRH_POST : StSize == 1 ? ARM::t2STRB_POST : 0; return StSize == 4 ? ARM::STR_POST_IMM : StSize == 2 ? ARM::STRH_POST : StSize == 1 ? ARM::STRB_POST_IMM : 0; } /// Emit a post-increment load operation with given size. The instructions /// will be added to BB at Pos. static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos, const TargetInstrInfo *TII, DebugLoc dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2) { unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); assert(LdOpc != 0 && "Should have a load opcode"); if (LdSize >= 8) { AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) .addReg(AddrOut, RegState::Define).addReg(AddrIn) .addImm(0)); } else if (IsThumb1) { // load + update AddrIn AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) .addReg(AddrIn).addImm(0)); MachineInstrBuilder MIB = BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut); MIB = AddDefaultT1CC(MIB); MIB.addReg(AddrIn).addImm(LdSize); AddDefaultPred(MIB); } else if (IsThumb2) { AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) .addReg(AddrOut, RegState::Define).addReg(AddrIn) .addImm(LdSize)); } else { // arm AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) .addReg(AddrOut, RegState::Define).addReg(AddrIn) .addReg(0).addImm(LdSize)); } } /// Emit a post-increment store operation with given size. The instructions /// will be added to BB at Pos. static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos, const TargetInstrInfo *TII, DebugLoc dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2) { unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); assert(StOpc != 0 && "Should have a store opcode"); if (StSize >= 8) { AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) .addReg(AddrIn).addImm(0).addReg(Data)); } else if (IsThumb1) { // store + update AddrIn AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data) .addReg(AddrIn).addImm(0)); MachineInstrBuilder MIB = BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut); MIB = AddDefaultT1CC(MIB); MIB.addReg(AddrIn).addImm(StSize); AddDefaultPred(MIB); } else if (IsThumb2) { AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) .addReg(Data).addReg(AddrIn).addImm(StSize)); } else { // arm AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) .addReg(Data).addReg(AddrIn).addReg(0) .addImm(StSize)); } } MachineBasicBlock * ARMTargetLowering::EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { // This pseudo instruction has 3 operands: dst, src, size // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). // Otherwise, we will generate unrolled scalar copies. const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI->getOperand(0).getReg(); unsigned src = MI->getOperand(1).getReg(); unsigned SizeVal = MI->getOperand(2).getImm(); unsigned Align = MI->getOperand(3).getImm(); DebugLoc dl = MI->getDebugLoc(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned UnitSize = 0; const TargetRegisterClass *TRC = nullptr; const TargetRegisterClass *VecTRC = nullptr; bool IsThumb1 = Subtarget->isThumb1Only(); bool IsThumb2 = Subtarget->isThumb2(); if (Align & 1) { UnitSize = 1; } else if (Align & 2) { UnitSize = 2; } else { // Check whether we can use NEON instructions. if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) && Subtarget->hasNEON()) { if ((Align % 16 == 0) && SizeVal >= 16) UnitSize = 16; else if ((Align % 8 == 0) && SizeVal >= 8) UnitSize = 8; } // Can't use NEON instructions. if (UnitSize == 0) UnitSize = 4; } // Select the correct opcode and register class for unit size load/store bool IsNeon = UnitSize >= 8; TRC = (IsThumb1 || IsThumb2) ? &ARM::tGPRRegClass : &ARM::GPRRegClass; if (IsNeon) VecTRC = UnitSize == 16 ? &ARM::DPairRegClass : UnitSize == 8 ? &ARM::DPRRegClass : nullptr; unsigned BytesLeft = SizeVal % UnitSize; unsigned LoopSize = SizeVal - BytesLeft; if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { // Use LDR and STR to copy. // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) // [destOut] = STR_POST(scratch, destIn, UnitSize) unsigned srcIn = src; unsigned destIn = dest; for (unsigned i = 0; i < LoopSize; i+=UnitSize) { unsigned srcOut = MRI.createVirtualRegister(TRC); unsigned destOut = MRI.createVirtualRegister(TRC); unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } // Handle the leftover bytes with LDRB and STRB. // [scratch, srcOut] = LDRB_POST(srcIn, 1) // [destOut] = STRB_POST(scratch, destIn, 1) for (unsigned i = 0; i < BytesLeft; i++) { unsigned srcOut = MRI.createVirtualRegister(TRC); unsigned destOut = MRI.createVirtualRegister(TRC); unsigned scratch = MRI.createVirtualRegister(TRC); emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } MI->eraseFromParent(); // The instruction is gone now. return BB; } // Expand the pseudo op to a loop. // thisMBB: // ... // movw varEnd, # --> with thumb2 // movt varEnd, # // ldrcp varEnd, idx --> without thumb2 // fallthrough --> loopMBB // loopMBB: // PHI varPhi, varEnd, varLoop // PHI srcPhi, src, srcLoop // PHI destPhi, dst, destLoop // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) // [destLoop] = STR_POST(scratch, destPhi, UnitSize) // subs varLoop, varPhi, #UnitSize // bne loopMBB // fallthrough --> exitMBB // exitMBB: // epilogue to handle left-over bytes // [scratch, srcOut] = LDRB_POST(srcLoop, 1) // [destOut] = STRB_POST(scratch, destLoop, 1) MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, loopMBB); MF->insert(It, exitMBB); // Transfer the remainder of BB and its successor edges to exitMBB. exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); // Load an immediate to varEnd. unsigned varEnd = MRI.createVirtualRegister(TRC); if (Subtarget->useMovt(*MF)) { unsigned Vtmp = varEnd; if ((LoopSize & 0xFFFF0000) != 0) Vtmp = MRI.createVirtualRegister(TRC); AddDefaultPred(BuildMI(BB, dl, TII->get(IsThumb2 ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp).addImm(LoopSize & 0xFFFF)); if ((LoopSize & 0xFFFF0000) != 0) AddDefaultPred(BuildMI(BB, dl, TII->get(IsThumb2 ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) .addReg(Vtmp) .addImm(LoopSize >> 16)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); const Constant *C = ConstantInt::get(Int32Ty, LoopSize); // MachineConstantPool wants an explicit alignment. unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); if (IsThumb1) AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg( varEnd, RegState::Define).addConstantPoolIndex(Idx)); else AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg( varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0)); } BB->addSuccessor(loopMBB); // Generate the loop body: // varPhi = PHI(varLoop, varEnd) // srcPhi = PHI(srcLoop, src) // destPhi = PHI(destLoop, dst) MachineBasicBlock *entryBB = BB; BB = loopMBB; unsigned varLoop = MRI.createVirtualRegister(TRC); unsigned varPhi = MRI.createVirtualRegister(TRC); unsigned srcLoop = MRI.createVirtualRegister(TRC); unsigned srcPhi = MRI.createVirtualRegister(TRC); unsigned destLoop = MRI.createVirtualRegister(TRC); unsigned destPhi = MRI.createVirtualRegister(TRC); BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) .addReg(varLoop).addMBB(loopMBB) .addReg(varEnd).addMBB(entryBB); BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) .addReg(srcLoop).addMBB(loopMBB) .addReg(src).addMBB(entryBB); BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) .addReg(destLoop).addMBB(loopMBB) .addReg(dest).addMBB(entryBB); // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, IsThumb1, IsThumb2); emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, IsThumb1, IsThumb2); // Decrement loop variable by UnitSize. if (IsThumb1) { MachineInstrBuilder MIB = BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop); MIB = AddDefaultT1CC(MIB); MIB.addReg(varPhi).addImm(UnitSize); AddDefaultPred(MIB); } else { MachineInstrBuilder MIB = BuildMI(*BB, BB->end(), dl, TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize))); MIB->getOperand(5).setReg(ARM::CPSR); MIB->getOperand(5).setIsDef(true); } BuildMI(*BB, BB->end(), dl, TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); // loopMBB can loop back to loopMBB or fall through to exitMBB. BB->addSuccessor(loopMBB); BB->addSuccessor(exitMBB); // Add epilogue to handle BytesLeft. BB = exitMBB; MachineInstr *StartOfExit = exitMBB->begin(); // [scratch, srcOut] = LDRB_POST(srcLoop, 1) // [destOut] = STRB_POST(scratch, destLoop, 1) unsigned srcIn = srcLoop; unsigned destIn = destLoop; for (unsigned i = 0; i < BytesLeft; i++) { unsigned srcOut = MRI.createVirtualRegister(TRC); unsigned destOut = MRI.createVirtualRegister(TRC); unsigned scratch = MRI.createVirtualRegister(TRC); emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut, IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } MI->eraseFromParent(); // The instruction is gone now. return BB; } MachineBasicBlock * ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI, MachineBasicBlock *MBB) const { const TargetMachine &TM = getTargetMachine(); const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); assert(Subtarget->isTargetWindows() && "__chkstk is only supported on Windows"); assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode"); // __chkstk takes the number of words to allocate on the stack in R4, and // returns the stack adjustment in number of bytes in R4. This will not // clober any other registers (other than the obvious lr). // // Although, technically, IP should be considered a register which may be // clobbered, the call itself will not touch it. Windows on ARM is a pure // thumb-2 environment, so there is no interworking required. As a result, we // do not expect a veneer to be emitted by the linker, clobbering IP. // // Each module receives its own copy of __chkstk, so no import thunk is // required, again, ensuring that IP is not clobbered. // // Finally, although some linkers may theoretically provide a trampoline for // out of range calls (which is quite common due to a 32M range limitation of // branches for Thumb), we can generate the long-call version via // -mcmodel=large, alleviating the need for the trampoline which may clobber // IP. switch (TM.getCodeModel()) { case CodeModel::Small: case CodeModel::Medium: case CodeModel::Default: case CodeModel::Kernel: BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) .addImm((unsigned)ARMCC::AL).addReg(0) .addExternalSymbol("__chkstk") .addReg(ARM::R4, RegState::Implicit | RegState::Kill) .addReg(ARM::R4, RegState::Implicit | RegState::Define) .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead); break; case CodeModel::Large: case CodeModel::JITDefault: { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) .addExternalSymbol("__chkstk"); BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) .addImm((unsigned)ARMCC::AL).addReg(0) .addReg(Reg, RegState::Kill) .addReg(ARM::R4, RegState::Implicit | RegState::Kill) .addReg(ARM::R4, RegState::Implicit | RegState::Define) .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead); break; } } AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) .addReg(ARM::SP).addReg(ARM::R4))); MI->eraseFromParent(); return MBB; } MachineBasicBlock * ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); MF->push_back(ContBB); ContBB->splice(ContBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); MBB->addSuccessor(ContBB); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); MF->push_back(TrapBB); BuildMI(TrapBB, DL, TII->get(ARM::t2UDF)).addImm(249); MBB->addSuccessor(TrapBB); BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ)) .addReg(MI->getOperand(0).getReg()) .addMBB(TrapBB); MI->eraseFromParent(); return ContBB; } MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); switch (MI->getOpcode()) { default: { MI->dump(); llvm_unreachable("Unexpected instr type to insert"); } // The Thumb2 pre-indexed stores have the same MI operands, they just // define them differently in the .td files from the isel patterns, so // they need pseudos. case ARM::t2STR_preidx: MI->setDesc(TII->get(ARM::t2STR_PRE)); return BB; case ARM::t2STRB_preidx: MI->setDesc(TII->get(ARM::t2STRB_PRE)); return BB; case ARM::t2STRH_preidx: MI->setDesc(TII->get(ARM::t2STRH_PRE)); return BB; case ARM::STRi_preidx: case ARM::STRBi_preidx: { unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM; // Decode the offset. unsigned Offset = MI->getOperand(4).getImm(); bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; Offset = ARM_AM::getAM2Offset(Offset); if (isSub) Offset = -Offset; MachineMemOperand *MMO = *MI->memoperands_begin(); BuildMI(*BB, MI, dl, TII->get(NewOpc)) .addOperand(MI->getOperand(0)) // Rn_wb .addOperand(MI->getOperand(1)) // Rt .addOperand(MI->getOperand(2)) // Rn .addImm(Offset) // offset (skip GPR==zero_reg) .addOperand(MI->getOperand(5)) // pred .addOperand(MI->getOperand(6)) .addMemOperand(MMO); MI->eraseFromParent(); return BB; } case ARM::STRr_preidx: case ARM::STRBr_preidx: case ARM::STRH_preidx: { unsigned NewOpc; switch (MI->getOpcode()) { default: llvm_unreachable("unexpected opcode!"); case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); for (unsigned i = 0; i < MI->getNumOperands(); ++i) MIB.addOperand(MI->getOperand(i)); MI->eraseFromParent(); return BB; } case ARM::tMOVCCr_pseudo: { // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. The incoming instruction knows the // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... // TrueVal = ... // cmpTY ccX, r1, r2 // bCC copy1MBB // fallthrough --> copy0MBB MachineBasicBlock *thisMBB = BB; MachineFunction *F = BB->getParent(); MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, copy0MBB); F->insert(It, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB BB = copy0MBB; // Update machine-CFG edges BB->addSuccessor(sinkMBB); // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } case ARM::BCCi64: case ARM::BCCZi64: { // If there is an unconditional branch to the other successor, remove it. BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); // Compare both parts that make up the double comparison separately for // equality. bool RHSisZero = MI->getOpcode() == ARM::BCCZi64; unsigned LHS1 = MI->getOperand(1).getReg(); unsigned LHS2 = MI->getOperand(2).getReg(); if (RHSisZero) { AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(LHS1).addImm(0)); BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(LHS2).addImm(0) .addImm(ARMCC::EQ).addReg(ARM::CPSR); } else { unsigned RHS1 = MI->getOperand(3).getReg(); unsigned RHS2 = MI->getOperand(4).getReg(); AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) .addReg(LHS1).addReg(RHS1)); BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) .addReg(LHS2).addReg(RHS2) .addImm(ARMCC::EQ).addReg(ARM::CPSR); } MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB(); MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); if (MI->getOperand(0).getImm() == ARMCC::NE) std::swap(destMBB, exitMBB); BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); if (isThumb2) AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB)); else BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } case ARM::Int_eh_sjlj_setjmp: case ARM::Int_eh_sjlj_setjmp_nofp: case ARM::tInt_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp_nofp: return BB; case ARM::Int_eh_sjlj_setup_dispatch: EmitSjLjDispatchBlock(MI, BB); return BB; case ARM::ABS: case ARM::t2ABS: { // To insert an ABS instruction, we have to insert the // diamond control-flow pattern. The incoming instruction knows the // source vreg to test against 0, the destination vreg to set, // the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. // It transforms // V1 = ABS V0 // into // V2 = MOVS V0 // BCC (branch to SinkBB if V0 >= 0) // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) // SinkBB: V1 = PHI(V2, V3) const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator BBI = ++BB->getIterator(); MachineFunction *Fn = BB->getParent(); MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); Fn->insert(BBI, RSBBB); Fn->insert(BBI, SinkBB); unsigned int ABSSrcReg = MI->getOperand(1).getReg(); unsigned int ABSDstReg = MI->getOperand(0).getReg(); bool ABSSrcKIll = MI->getOperand(1).isKill(); bool isThumb2 = Subtarget->isThumb2(); MachineRegisterInfo &MRI = Fn->getRegInfo(); // In Thumb mode S must not be specified if source register is the SP or // PC and if destination register is the SP, so restrict register class unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); // Transfer the remainder of BB and its successor edges to sinkMBB. SinkBB->splice(SinkBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); SinkBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(RSBBB); BB->addSuccessor(SinkBB); // fall through to SinkMBB RSBBB->addSuccessor(SinkBB); // insert a cmp at the end of BB AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(ABSSrcReg).addImm(0)); // insert a bcc with opposite CC to ARMCC::MI at the end of BB BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); // insert rsbri in RSBBB // Note: BCC and rsbri will be converted into predicated rsbmi // by if-conversion pass BuildMI(*RSBBB, RSBBB->begin(), dl, TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); // insert PHI in SinkBB, // reuse ABSDstReg to not change uses of ABS instruction BuildMI(*SinkBB, SinkBB->begin(), dl, TII->get(ARM::PHI), ABSDstReg) .addReg(NewRsbDstReg).addMBB(RSBBB) .addReg(ABSSrcReg).addMBB(BB); // remove ABS instruction MI->eraseFromParent(); // return last added BB return SinkBB; } case ARM::COPY_STRUCT_BYVAL_I32: ++NumLoopByVals; return EmitStructByval(MI, BB); case ARM::WIN__CHKSTK: return EmitLowered__chkstk(MI, BB); case ARM::WIN__DBZCHK: return EmitLowered__dbzchk(MI, BB); } } /// \brief Attaches vregs to MEMCPY that it will use as scratch registers /// when it is expanded into LDM/STM. This is done as a post-isel lowering /// instead of as a custom inserter because we need the use list from the SDNode. static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr *MI, const SDNode *Node) { bool isThumb1 = Subtarget->isThumb1Only(); DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MI->getParent()->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineInstrBuilder MIB(*MF, MI); // If the new dst/src is unused mark it as dead. if (!Node->hasAnyUseOfValue(0)) { MI->getOperand(0).setIsDead(true); } if (!Node->hasAnyUseOfValue(1)) { MI->getOperand(1).setIsDead(true); } // The MEMCPY both defines and kills the scratch registers. for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) { unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass); MIB.addReg(TmpReg, RegState::Define|RegState::Dead); } } void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const { if (MI->getOpcode() == ARM::MEMCPY) { attachMEMCPYScratchRegs(Subtarget, MI, Node); return; } const MCInstrDesc *MCID = &MI->getDesc(); // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, // RSC. Coming out of isel, they have an implicit CPSR def, but the optional // operand is still set to noreg. If needed, set the optional operand's // register to CPSR, and remove the redundant implicit def. // // e.g. ADCS (..., CPSR) -> ADC (... opt:CPSR). // Rename pseudo opcodes. unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode()); if (NewOpc) { const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); MCID = &TII->get(NewOpc); assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 && "converted opcode should be the same except for cc_out"); MI->setDesc(*MCID); // Add the optional cc_out operand MI->addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); } unsigned ccOutIdx = MCID->getNumOperands() - 1; // Any ARM instruction that sets the 's' bit should specify an optional // "cc_out" operand in the last operand position. if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { assert(!NewOpc && "Optional cc_out operand required"); return; } // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it // since we already have an optional CPSR def. bool definesCPSR = false; bool deadCPSR = false; for (unsigned i = MCID->getNumOperands(), e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { definesCPSR = true; if (MO.isDead()) deadCPSR = true; MI->RemoveOperand(i); break; } } if (!definesCPSR) { assert(!NewOpc && "Optional cc_out operand required"); return; } assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); if (deadCPSR) { assert(!MI->getOperand(ccOutIdx).getReg() && "expect uninitialized optional cc_out operand"); return; } // If this instruction was defined with an optional CPSR def and its dag node // had a live implicit CPSR def, then activate the optional CPSR def. MachineOperand &MO = MI->getOperand(ccOutIdx); MO.setReg(ARM::CPSR); MO.setIsDef(true); } //===----------------------------------------------------------------------===// // ARM Optimization Hooks //===----------------------------------------------------------------------===// // Helper function that checks if N is a null or all ones constant. static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); } // Return true if N is conditionally 0 or all ones. // Detects these expressions where cc is an i1 value: // // (select cc 0, y) [AllOnes=0] // (select cc y, 0) [AllOnes=0] // (zext cc) [AllOnes=0] // (sext cc) [AllOnes=0/1] // (select cc -1, y) [AllOnes=1] // (select cc y, -1) [AllOnes=1] // // Invert is set when N is the null/all ones constant when CC is false. // OtherOp is set to the alternative value of N. static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG) { switch (N->getOpcode()) { default: return false; case ISD::SELECT: { CC = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); if (isZeroOrAllOnes(N1, AllOnes)) { Invert = false; OtherOp = N2; return true; } if (isZeroOrAllOnes(N2, AllOnes)) { Invert = true; OtherOp = N1; return true; } return false; } case ISD::ZERO_EXTEND: // (zext cc) can never be the all ones value. if (AllOnes) return false; // Fall through. case ISD::SIGN_EXTEND: { SDLoc dl(N); EVT VT = N->getValueType(0); CC = N->getOperand(0); if (CC.getValueType() != MVT::i1) return false; Invert = !AllOnes; if (AllOnes) // When looking for an AllOnes constant, N is an sext, and the 'other' // value is 0. OtherOp = DAG.getConstant(0, dl, VT); else if (N->getOpcode() == ISD::ZERO_EXTEND) // When looking for a 0 constant, N can be zext or sext. OtherOp = DAG.getConstant(1, dl, VT); else OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, VT); return true; } } } // Combine a constant select operand into its use: // // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) // // The transform is rejected if the select doesn't have a constant operand that // is null, or all ones when AllOnes is set. // // Also recognize sext/zext from i1: // // (add (zext cc), x) -> (select cc (add x, 1), x) // (add (sext cc), x) -> (select cc (add x, -1), x) // // These transformations eventually create predicated instructions. // // @param N The node to transform. // @param Slct The N operand that is a select. // @param OtherOp The other N operand (x above). // @param DCI Context. // @param AllOnes Require the select constant to be all ones instead of null. // @returns The new node, or SDValue() on failure. static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes = false) { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); SDValue NonConstantVal; SDValue CCOp; bool SwapSelectOps; if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, NonConstantVal, DAG)) return SDValue(); // Slct is now know to be the desired identity constant when CC is true. SDValue TrueVal = OtherOp; SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, OtherOp, NonConstantVal); // Unless SwapSelectOps says CC should be false. if (SwapSelectOps) std::swap(TrueVal, FalseVal); return DAG.getNode(ISD::SELECT, SDLoc(N), VT, CCOp, TrueVal, FalseVal); } // Attempt combineSelectAndUse on each operand of a commutative operator N. static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); if (N0.getNode()->hasOneUse()) { SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes); if (Result.getNode()) return Result; } if (N1.getNode()->hasOneUse()) { SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes); if (Result.getNode()) return Result; } return SDValue(); } // AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction // (only after legalization). static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Only perform optimization if after legalize, and if NEON is available. We // also expected both operands to be BUILD_VECTORs. if (DCI.isBeforeLegalize() || !Subtarget->hasNEON() || N0.getOpcode() != ISD::BUILD_VECTOR || N1.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); // Check output type since VPADDL operand elements can only be 8, 16, or 32. EVT VT = N->getValueType(0); if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64) return SDValue(); // Check that the vector operands are of the right form. // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR // operands, where N is the size of the formed vector. // Each EXTRACT_VECTOR should have the same input vector and odd or even // index such that we have a pair wise add pattern. // Grab the vector that all EXTRACT_VECTOR nodes should be referencing. if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); SDValue Vec = N0->getOperand(0)->getOperand(0); SDNode *V = Vec.getNode(); unsigned nextIndex = 0; // For each operands to the ADD which are BUILD_VECTORs, // check to see if each of their operands are an EXTRACT_VECTOR with // the same vector and appropriate index. for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) { if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue ExtVec0 = N0->getOperand(i); SDValue ExtVec1 = N1->getOperand(i); // First operand is the vector, verify its the same. if (V != ExtVec0->getOperand(0).getNode() || V != ExtVec1->getOperand(0).getNode()) return SDValue(); // Second is the constant, verify its correct. ConstantSDNode *C0 = dyn_cast(ExtVec0->getOperand(1)); ConstantSDNode *C1 = dyn_cast(ExtVec1->getOperand(1)); // For the constant, we want to see all the even or all the odd. if (!C0 || !C1 || C0->getZExtValue() != nextIndex || C1->getZExtValue() != nextIndex+1) return SDValue(); // Increment index. nextIndex+=2; } else return SDValue(); } // Create VPADDL node. SelectionDAG &DAG = DCI.DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc dl(N); // Build operand list. SmallVector Ops; Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, TLI.getPointerTy(DAG.getDataLayout()))); // Input is the vector. Ops.push_back(Vec); // Get widened type and narrowed type. MVT widenType; unsigned numElem = VT.getVectorNumElements(); EVT inputLaneType = Vec.getValueType().getVectorElementType(); switch (inputLaneType.getSimpleVT().SimpleTy) { case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break; case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break; case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break; default: llvm_unreachable("Invalid vector element type for padd optimization."); } SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops); unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE; return DAG.getNode(ExtOp, dl, VT, tmp); } static SDValue findMUL_LOHI(SDValue V) { if (V->getOpcode() == ISD::UMUL_LOHI || V->getOpcode() == ISD::SMUL_LOHI) return V; return SDValue(); } static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { if (Subtarget->isThumb1Only()) return SDValue(); // Only perform the checks after legalize when the pattern is available. if (DCI.isBeforeLegalize()) return SDValue(); // Look for multiply add opportunities. // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where // each add nodes consumes a value from ISD::UMUL_LOHI and there is // a glue link from the first add to the second add. // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by // a S/UMLAL instruction. // UMUL_LOHI // / :lo \ :hi // / \ [no multiline comment] // loAdd -> ADDE | // \ :glue / // \ / // ADDC <- hiAdd // assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC"); SDValue AddcOp0 = AddcNode->getOperand(0); SDValue AddcOp1 = AddcNode->getOperand(1); // Check if the two operands are from the same mul_lohi node. if (AddcOp0.getNode() == AddcOp1.getNode()) return SDValue(); assert(AddcNode->getNumValues() == 2 && AddcNode->getValueType(0) == MVT::i32 && "Expect ADDC with two result values. First: i32"); // Check that we have a glued ADDC node. if (AddcNode->getValueType(1) != MVT::Glue) return SDValue(); // Check that the ADDC adds the low result of the S/UMUL_LOHI. if (AddcOp0->getOpcode() != ISD::UMUL_LOHI && AddcOp0->getOpcode() != ISD::SMUL_LOHI && AddcOp1->getOpcode() != ISD::UMUL_LOHI && AddcOp1->getOpcode() != ISD::SMUL_LOHI) return SDValue(); // Look for the glued ADDE. SDNode* AddeNode = AddcNode->getGluedUser(); if (!AddeNode) return SDValue(); // Make sure it is really an ADDE. if (AddeNode->getOpcode() != ISD::ADDE) return SDValue(); assert(AddeNode->getNumOperands() == 3 && AddeNode->getOperand(2).getValueType() == MVT::Glue && "ADDE node has the wrong inputs"); // Check for the triangle shape. SDValue AddeOp0 = AddeNode->getOperand(0); SDValue AddeOp1 = AddeNode->getOperand(1); // Make sure that the ADDE operands are not coming from the same node. if (AddeOp0.getNode() == AddeOp1.getNode()) return SDValue(); // Find the MUL_LOHI node walking up ADDE's operands. bool IsLeftOperandMUL = false; SDValue MULOp = findMUL_LOHI(AddeOp0); if (MULOp == SDValue()) MULOp = findMUL_LOHI(AddeOp1); else IsLeftOperandMUL = true; if (MULOp == SDValue()) return SDValue(); // Figure out the right opcode. unsigned Opc = MULOp->getOpcode(); unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; // Figure out the high and low input values to the MLAL node. SDValue* HiAdd = nullptr; SDValue* LoMul = nullptr; SDValue* LowAdd = nullptr; // Ensure that ADDE is from high result of ISD::SMUL_LOHI. if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1))) return SDValue(); if (IsLeftOperandMUL) HiAdd = &AddeOp1; else HiAdd = &AddeOp0; // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node // whose low result is fed to the ADDC we are checking. if (AddcOp0 == MULOp.getValue(0)) { LoMul = &AddcOp0; LowAdd = &AddcOp1; } if (AddcOp1 == MULOp.getValue(0)) { LoMul = &AddcOp1; LowAdd = &AddcOp0; } if (!LoMul) return SDValue(); // Create the merged node. SelectionDAG &DAG = DCI.DAG; // Build operand list. SmallVector Ops; Ops.push_back(LoMul->getOperand(0)); Ops.push_back(LoMul->getOperand(1)); Ops.push_back(*LowAdd); Ops.push_back(*HiAdd); SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcNode), DAG.getVTList(MVT::i32, MVT::i32), Ops); // Replace the ADDs' nodes uses by the MLA node's values. SDValue HiMLALResult(MLALNode.getNode(), 1); DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); SDValue LoMLALResult(MLALNode.getNode(), 0); DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); // Return original node to notify the driver to stop replacing. SDValue resNode(AddcNode, 0); return resNode; } /// PerformADDCCombine - Target-specific dag combine transform from /// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL. static SDValue PerformADDCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { return AddCombineTo64bitMLAL(N, DCI, Subtarget); } /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with /// operands N0 and N1. This is a helper for PerformADDCombine that is /// called with the default operands, and if that fails, with commuted /// operands. static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget){ // Attempt to create vpaddl for this add. SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget); if (Result.getNode()) return Result; // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) if (N0.getNode()->hasOneUse()) { SDValue Result = combineSelectAndUse(N, N0, N1, DCI); if (Result.getNode()) return Result; } return SDValue(); } /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. /// static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // First try with the default operand order. SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget); if (Result.getNode()) return Result; // If that didn't work, try again with the operands commuted. return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); } /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. /// static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) if (N1.getNode()->hasOneUse()) { SDValue Result = combineSelectAndUse(N, N1, N0, DCI); if (Result.getNode()) return Result; } return SDValue(); } /// PerformVMULCombine /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the /// special multiplier accumulator forwarding. /// vmul d3, d0, d2 /// vmla d3, d1, d2 /// is faster than /// vadd d3, d0, d1 /// vmul d3, d3, d2 // However, for (A + B) * (A + B), // vadd d2, d0, d1 // vmul d3, d0, d2 // vmla d3, d1, d2 // is slower than // vadd d2, d0, d1 // vmul d3, d2, d2 static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { if (!Subtarget->hasVMLxForwarding()) return SDValue(); SelectionDAG &DAG = DCI.DAG; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); unsigned Opcode = N0.getOpcode(); if (Opcode != ISD::ADD && Opcode != ISD::SUB && Opcode != ISD::FADD && Opcode != ISD::FSUB) { Opcode = N1.getOpcode(); if (Opcode != ISD::ADD && Opcode != ISD::SUB && Opcode != ISD::FADD && Opcode != ISD::FSUB) return SDValue(); std::swap(N0, N1); } if (N0 == N1) return SDValue(); EVT VT = N->getValueType(0); SDLoc DL(N); SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); return DAG.getNode(Opcode, DL, VT, DAG.getNode(ISD::MUL, DL, VT, N00, N1), DAG.getNode(ISD::MUL, DL, VT, N01, N1)); } static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; if (Subtarget->isThumb1Only()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); EVT VT = N->getValueType(0); if (VT.is64BitVector() || VT.is128BitVector()) return PerformVMULCombine(N, DCI, Subtarget); if (VT != MVT::i32) return SDValue(); ConstantSDNode *C = dyn_cast(N->getOperand(1)); if (!C) return SDValue(); int64_t MulAmt = C->getSExtValue(); unsigned ShiftAmt = countTrailingZeros(MulAmt); ShiftAmt = ShiftAmt & (32 - 1); SDValue V = N->getOperand(0); SDLoc DL(N); SDValue Res; MulAmt >>= ShiftAmt; if (MulAmt >= 0) { if (isPowerOf2_32(MulAmt - 1)) { // (mul x, 2^N + 1) => (add (shl x, N), x) Res = DAG.getNode(ISD::ADD, DL, VT, V, DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Log2_32(MulAmt - 1), DL, MVT::i32))); } else if (isPowerOf2_32(MulAmt + 1)) { // (mul x, 2^N - 1) => (sub (shl x, N), x) Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Log2_32(MulAmt + 1), DL, MVT::i32)), V); } else return SDValue(); } else { uint64_t MulAmtAbs = -MulAmt; if (isPowerOf2_32(MulAmtAbs + 1)) { // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) Res = DAG.getNode(ISD::SUB, DL, VT, V, DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, MVT::i32))); } else if (isPowerOf2_32(MulAmtAbs - 1)) { // (mul x, -(2^N + 1)) => - (add (shl x, N), x) Res = DAG.getNode(ISD::ADD, DL, VT, V, DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, MVT::i32))); Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, MVT::i32), Res); } else return SDValue(); } if (ShiftAmt != 0) Res = DAG.getNode(ISD::SHL, DL, VT, Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, Res, false); return SDValue(); } static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Attempt to use immediate-form VBIC BuildVectorSDNode *BVN = dyn_cast(N->getOperand(1)); SDLoc dl(N); EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (BVN && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { EVT VbicVT; SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VbicVT, VT.is128BitVector(), OtherModImm); if (Val.getNode()) { SDValue Input = DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); } } } if (!Subtarget->isThumb1Only()) { // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) SDValue Result = combineSelectAndUseCommutative(N, true, DCI); if (Result.getNode()) return Result; } return SDValue(); } /// PerformORCombine - Target-specific dag combine xforms for ISD::OR static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Attempt to use immediate-form VORR BuildVectorSDNode *BVN = dyn_cast(N->getOperand(1)); SDLoc dl(N); EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (BVN && Subtarget->hasNEON() && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { EVT VorrVT; SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VorrVT, VT.is128BitVector(), OtherModImm); if (Val.getNode()) { SDValue Input = DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); } } } if (!Subtarget->isThumb1Only()) { // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) SDValue Result = combineSelectAndUseCommutative(N, false, DCI); if (Result.getNode()) return Result; } // The code below optimizes (or (and X, Y), Z). // The AND operand needs to have a single user to make these optimizations // profitable. SDValue N0 = N->getOperand(0); if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) return SDValue(); SDValue N1 = N->getOperand(1); // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) { APInt SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; APInt SplatBits0, SplatBits1; BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(1)); BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(1)); // Ensure that the second operand of both ands are constants if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, HasAnyUndefs) && !HasAnyUndefs) { if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, HasAnyUndefs) && !HasAnyUndefs) { // Ensure that the bit width of the constants are the same and that // the splat arguments are logical inverses as per the pattern we // are trying to simplify. if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && SplatBits0 == ~SplatBits1) { // Canonicalize the vector type to make instruction selection // simpler. EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, N0->getOperand(1), N0->getOperand(0), N1->getOperand(0)); return DAG.getNode(ISD::BITCAST, dl, VT, Result); } } } } // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when // reasonable. // BFI is only available on V6T2+ if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) return SDValue(); SDLoc DL(N); // 1) or (and A, mask), val => ARMbfi A, val, mask // iff (val & mask) == val // // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2) // && mask == ~mask2 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2) // && ~mask == mask2 // (i.e., copy a bitfield value into another bitfield of the same width) if (VT != MVT::i32) return SDValue(); SDValue N00 = N0.getOperand(0); // The value and the mask need to be constants so we can verify this is // actually a bitfield set. If the mask is 0xffff, we can do better // via a movt instruction, so don't use BFI in that case. SDValue MaskOp = N0.getOperand(1); ConstantSDNode *MaskC = dyn_cast(MaskOp); if (!MaskC) return SDValue(); unsigned Mask = MaskC->getZExtValue(); if (Mask == 0xffff) return SDValue(); SDValue Res; // Case (1): or (and A, mask), val => ARMbfi A, val, mask ConstantSDNode *N1C = dyn_cast(N1); if (N1C) { unsigned Val = N1C->getZExtValue(); if ((Val & ~Mask) != Val) return SDValue(); if (ARM::isBitFieldInvertedMask(Mask)) { Val >>= countTrailingZeros(~Mask); Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, DAG.getConstant(Val, DL, MVT::i32), DAG.getConstant(Mask, DL, MVT::i32)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, Res, false); return SDValue(); } } else if (N1.getOpcode() == ISD::AND) { // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask ConstantSDNode *N11C = dyn_cast(N1.getOperand(1)); if (!N11C) return SDValue(); unsigned Mask2 = N11C->getZExtValue(); // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern // as is to match. if (ARM::isBitFieldInvertedMask(Mask) && (Mask == ~Mask2)) { // The pack halfword instruction works better for masks that fit it, // so use that when it's available. if (Subtarget->hasT2ExtractPack() && (Mask == 0xffff || Mask == 0xffff0000)) return SDValue(); // 2a unsigned amt = countTrailingZeros(Mask2); Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), DAG.getConstant(amt, DL, MVT::i32)); Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, DAG.getConstant(Mask, DL, MVT::i32)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, Res, false); return SDValue(); } else if (ARM::isBitFieldInvertedMask(~Mask) && (~Mask == Mask2)) { // The pack halfword instruction works better for masks that fit it, // so use that when it's available. if (Subtarget->hasT2ExtractPack() && (Mask2 == 0xffff || Mask2 == 0xffff0000)) return SDValue(); // 2b unsigned lsb = countTrailingZeros(Mask); Res = DAG.getNode(ISD::SRL, DL, VT, N00, DAG.getConstant(lsb, DL, MVT::i32)); Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, DAG.getConstant(Mask2, DL, MVT::i32)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, Res, false); return SDValue(); } } if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) && N00.getOpcode() == ISD::SHL && isa(N00.getOperand(1)) && ARM::isBitFieldInvertedMask(~Mask)) { // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask // where lsb(mask) == #shamt and masked bits of B are known zero. SDValue ShAmt = N00.getOperand(1); unsigned ShAmtC = cast(ShAmt)->getZExtValue(); unsigned LSB = countTrailingZeros(Mask); if (ShAmtC != LSB) return SDValue(); Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0), DAG.getConstant(~Mask, DL, MVT::i32)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, Res, false); } return SDValue(); } static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); if (!Subtarget->isThumb1Only()) { // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) SDValue Result = combineSelectAndUseCommutative(N, false, DCI); if (Result.getNode()) return Result; } return SDValue(); } // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and // their position in "to" (Rd). static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { assert(N->getOpcode() == ARMISD::BFI); SDValue From = N->getOperand(1); ToMask = ~cast(N->getOperand(2))->getAPIntValue(); FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); // If the Base came from a SHR #C, we can deduce that it is really testing bit // #C in the base of the SHR. if (From->getOpcode() == ISD::SRL && isa(From->getOperand(1))) { APInt Shift = cast(From->getOperand(1))->getAPIntValue(); assert(Shift.getLimitedValue() < 32 && "Shift too large!"); FromMask <<= Shift.getLimitedValue(31); From = From->getOperand(0); } return From; } // If A and B contain one contiguous set of bits, does A | B == A . B? // // Neither A nor B must be zero. static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { unsigned LastActiveBitInA = A.countTrailingZeros(); unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; return LastActiveBitInA - 1 == FirstActiveBitInB; } static SDValue FindBFIToCombineWith(SDNode *N) { // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, // if one exists. APInt ToMask, FromMask; SDValue From = ParseBFI(N, ToMask, FromMask); SDValue To = N->getOperand(0); // Now check for a compatible BFI to merge with. We can pass through BFIs that // aren't compatible, but not if they set the same bit in their destination as // we do (or that of any BFI we're going to combine with). SDValue V = To; APInt CombinedToMask = ToMask; while (V.getOpcode() == ARMISD::BFI) { APInt NewToMask, NewFromMask; SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); if (NewFrom != From) { // This BFI has a different base. Keep going. CombinedToMask |= NewToMask; V = V.getOperand(0); continue; } // Do the written bits conflict with any we've seen so far? if ((NewToMask & CombinedToMask).getBoolValue()) // Conflicting bits - bail out because going further is unsafe. return SDValue(); // Are the new bits contiguous when combined with the old bits? if (BitsProperlyConcatenate(ToMask, NewToMask) && BitsProperlyConcatenate(FromMask, NewFromMask)) return V; if (BitsProperlyConcatenate(NewToMask, ToMask) && BitsProperlyConcatenate(NewFromMask, FromMask)) return V; // We've seen a write to some bits, so track it. CombinedToMask |= NewToMask; // Keep going... V = V.getOperand(0); } return SDValue(); } static SDValue PerformBFICombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDValue N1 = N->getOperand(1); if (N1.getOpcode() == ISD::AND) { // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff // the bits being cleared by the AND are not demanded by the BFI. ConstantSDNode *N11C = dyn_cast(N1.getOperand(1)); if (!N11C) return SDValue(); unsigned InvMask = cast(N->getOperand(2))->getZExtValue(); unsigned LSB = countTrailingZeros(~InvMask); unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; assert(Width < static_cast(std::numeric_limits::digits) && "undefined behavior"); unsigned Mask = (1u << Width) - 1; unsigned Mask2 = N11C->getZExtValue(); if ((Mask & (~Mask2)) == 0) return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), N->getOperand(0), N1.getOperand(0), N->getOperand(2)); } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. // Keep track of any consecutive bits set that all come from the same base // value. We can combine these together into a single BFI. SDValue CombineBFI = FindBFIToCombineWith(N); if (CombineBFI == SDValue()) return SDValue(); // We've found a BFI. APInt ToMask1, FromMask1; SDValue From1 = ParseBFI(N, ToMask1, FromMask1); APInt ToMask2, FromMask2; SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); assert(From1 == From2); (void)From2; // First, unlink CombineBFI. DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); // Then create a new BFI, combining the two together. APInt NewFromMask = FromMask1 | FromMask2; APInt NewToMask = ToMask1 | ToMask2; EVT VT = N->getValueType(0); SDLoc dl(N); if (NewFromMask[0] == 0) From1 = DCI.DAG.getNode( ISD::SRL, dl, VT, From1, DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, DCI.DAG.getConstant(~NewToMask, dl, VT)); } return SDValue(); } /// PerformVMOVRRDCombine - Target-specific dag combine xforms for /// ARMISD::VMOVRRD. static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // vmovrrd(vmovdrr x, y) -> x,y SDValue InDouble = N->getOperand(0); if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP()) return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); // vmovrrd(load f64) -> (load i32), (load i32) SDNode *InNode = InDouble.getNode(); if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && InNode->getValueType(0) == MVT::f64 && InNode->getOperand(1).getOpcode() == ISD::FrameIndex && !cast(InNode)->isVolatile()) { // TODO: Should this be done for non-FrameIndex operands? LoadSDNode *LD = cast(InNode); SelectionDAG &DAG = DCI.DAG; SDLoc DL(LD); SDValue BasePtr = LD->getBasePtr(); SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), LD->getAlignment()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, DL, MVT::i32)); SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), std::min(4U, LD->getAlignment() / 2)); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); if (DCI.DAG.getDataLayout().isBigEndian()) std::swap (NewLD1, NewLD2); SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); return Result; } return SDValue(); } /// PerformVMOVDRRCombine - Target-specific dag combine xforms for /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); if (Op0.getOpcode() == ISD::BITCAST) Op0 = Op0.getOperand(0); if (Op1.getOpcode() == ISD::BITCAST) Op1 = Op1.getOperand(0); if (Op0.getOpcode() == ARMISD::VMOVRRD && Op0.getNode() == Op1.getNode() && Op0.getResNo() == 0 && Op1.getResNo() == 1) return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op0.getOperand(0)); return SDValue(); } /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node /// are normal, non-volatile loads. If so, it is profitable to bitcast an /// i64 vector to have f64 elements, since the value can then be loaded /// directly into a VFP register. static bool hasNormalLoadOperand(SDNode *N) { unsigned NumElts = N->getValueType(0).getVectorNumElements(); for (unsigned i = 0; i < NumElts; ++i) { SDNode *Elt = N->getOperand(i).getNode(); if (ISD::isNormalLoad(Elt) && !cast(Elt)->isVolatile()) return true; } return false; } /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for /// ISD::BUILD_VECTOR. static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value // into a pair of GPRs, which is fine when the value is used as a scalar, // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. SelectionDAG &DAG = DCI.DAG; if (N->getNumOperands() == 2) { SDValue RV = PerformVMOVDRRCombine(N, DAG); if (RV.getNode()) return RV; } // Load i64 elements as f64 values so that type legalization does not split // them up into i32 values. EVT VT = N->getValueType(0); if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) return SDValue(); SDLoc dl(N); SmallVector Ops; unsigned NumElts = VT.getVectorNumElements(); for (unsigned i = 0; i < NumElts; ++i) { SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); Ops.push_back(V); // Make the DAGCombiner fold the bitcast. DCI.AddToWorklist(V.getNode()); } EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, BV); } /// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. // At that time, we may have inserted bitcasts from integer to float. // If these bitcasts have survived DAGCombine, change the lowering of this // BUILD_VECTOR in something more vector friendly, i.e., that does not // force to use floating point types. // Make sure we can change the type of the vector. // This is possible iff: // 1. The vector is only used in a bitcast to a integer type. I.e., // 1.1. Vector is used only once. // 1.2. Use is a bit convert to an integer type. // 2. The size of its operands are 32-bits (64-bits are not legal). EVT VT = N->getValueType(0); EVT EltVT = VT.getVectorElementType(); // Check 1.1. and 2. if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) return SDValue(); // By construction, the input type must be float. assert(EltVT == MVT::f32 && "Unexpected type!"); // Check 1.2. SDNode *Use = *N->use_begin(); if (Use->getOpcode() != ISD::BITCAST || Use->getValueType(0).isFloatingPoint()) return SDValue(); // Check profitability. // Model is, if more than half of the relevant operands are bitcast from // i32, turn the build_vector into a sequence of insert_vector_elt. // Relevant operands are everything that is not statically // (i.e., at compile time) bitcasted. unsigned NumOfBitCastedElts = 0; unsigned NumElts = VT.getVectorNumElements(); unsigned NumOfRelevantElts = NumElts; for (unsigned Idx = 0; Idx < NumElts; ++Idx) { SDValue Elt = N->getOperand(Idx); if (Elt->getOpcode() == ISD::BITCAST) { // Assume only bit cast to i32 will go away. if (Elt->getOperand(0).getValueType() == MVT::i32) ++NumOfBitCastedElts; } else if (Elt.getOpcode() == ISD::UNDEF || isa(Elt)) // Constants are statically casted, thus do not count them as // relevant operands. --NumOfRelevantElts; } // Check if more than half of the elements require a non-free bitcast. if (NumOfBitCastedElts <= NumOfRelevantElts / 2) return SDValue(); SelectionDAG &DAG = DCI.DAG; // Create the new vector type. EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); // Check if the type is legal. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(VecVT)) return SDValue(); // Combine: // ARMISD::BUILD_VECTOR E1, E2, ..., EN. // => BITCAST INSERT_VECTOR_ELT // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), // (BITCAST EN), N. SDValue Vec = DAG.getUNDEF(VecVT); SDLoc dl(N); for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { SDValue V = N->getOperand(Idx); if (V.getOpcode() == ISD::UNDEF) continue; if (V.getOpcode() == ISD::BITCAST && V->getOperand(0).getValueType() == MVT::i32) // Fold obvious case. V = V.getOperand(0); else { V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); // Make the DAGCombiner fold the bitcasts. DCI.AddToWorklist(V.getNode()); } SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); } Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); // Make the DAGCombiner fold the bitcasts. DCI.AddToWorklist(Vec.getNode()); return Vec; } /// PerformInsertEltCombine - Target-specific dag combine xforms for /// ISD::INSERT_VECTOR_ELT. static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // Bitcast an i64 load inserted into a vector to f64. // Otherwise, the i64 value will be legalized to a pair of i32 values. EVT VT = N->getValueType(0); SDNode *Elt = N->getOperand(1).getNode(); if (VT.getVectorElementType() != MVT::i64 || !ISD::isNormalLoad(Elt) || cast(Elt)->isVolatile()) return SDValue(); SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VT.getVectorNumElements()); SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); // Make the DAGCombiner fold the bitcasts. DCI.AddToWorklist(Vec.getNode()); DCI.AddToWorklist(V.getNode()); SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, Vec, V, N->getOperand(2)); return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); } /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for /// ISD::VECTOR_SHUFFLE. static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { // The LLVM shufflevector instruction does not require the shuffle mask // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the // operands do not match the mask length, they are extended by concatenating // them with undef vectors. That is probably the right thing for other // targets, but for NEON it is better to concatenate two double-register // size vector operands into a single quad-register size vector. Do that // transformation here: // shuffle(concat(v1, undef), concat(v2, undef)) -> // shuffle(concat(v1, v2), undef) SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); if (Op0.getOpcode() != ISD::CONCAT_VECTORS || Op1.getOpcode() != ISD::CONCAT_VECTORS || Op0.getNumOperands() != 2 || Op1.getNumOperands() != 2) return SDValue(); SDValue Concat0Op1 = Op0.getOperand(1); SDValue Concat1Op1 = Op1.getOperand(1); if (Concat0Op1.getOpcode() != ISD::UNDEF || Concat1Op1.getOpcode() != ISD::UNDEF) return SDValue(); // Skip the transformation if any of the types are illegal. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(Concat0Op1.getValueType()) || !TLI.isTypeLegal(Concat1Op1.getValueType())) return SDValue(); SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Op0.getOperand(0), Op1.getOperand(0)); // Translate the shuffle mask. SmallVector NewMask; unsigned NumElts = VT.getVectorNumElements(); unsigned HalfElts = NumElts/2; ShuffleVectorSDNode *SVN = cast(N); for (unsigned n = 0; n < NumElts; ++n) { int MaskElt = SVN->getMaskElt(n); int NewElt = -1; if (MaskElt < (int)HalfElts) NewElt = MaskElt; else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) NewElt = HalfElts + MaskElt - NumElts; NewMask.push_back(NewElt); } return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, DAG.getUNDEF(VT), NewMask.data()); } /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, /// NEON load/store intrinsics, and generic vector load/stores, to merge /// base address updates. /// For generic load/stores, the memory type is assumed to be a vector. /// The caller is assumed to have checked legality. static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || N->getOpcode() == ISD::INTRINSIC_W_CHAIN); const bool isStore = N->getOpcode() == ISD::STORE; const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); SDValue Addr = N->getOperand(AddrOpIdx); MemSDNode *MemN = cast(N); SDLoc dl(N); // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = Addr.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User->getOpcode() != ISD::ADD || UI.getUse().getResNo() != Addr.getResNo()) continue; // Check that the add is independent of the load/store. Otherwise, folding // it would create a cycle. if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) continue; // Find the new opcode for the updating load/store. bool isLoadOp = true; bool isLaneOp = false; unsigned NewOpc = 0; unsigned NumVecs = 0; if (isIntrinsic) { unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default: llvm_unreachable("unexpected intrinsic for Neon base update"); case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; NumVecs = 1; break; case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; NumVecs = 2; break; case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; NumVecs = 3; break; case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; NumVecs = 4; break; case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; NumVecs = 2; isLaneOp = true; break; case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; NumVecs = 3; isLaneOp = true; break; case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; NumVecs = 4; isLaneOp = true; break; case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; NumVecs = 1; isLoadOp = false; break; case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; NumVecs = 2; isLoadOp = false; break; case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; NumVecs = 3; isLoadOp = false; break; case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; NumVecs = 4; isLoadOp = false; break; case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; NumVecs = 2; isLoadOp = false; isLaneOp = true; break; case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; NumVecs = 3; isLoadOp = false; isLaneOp = true; break; case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; NumVecs = 4; isLoadOp = false; isLaneOp = true; break; } } else { isLaneOp = true; switch (N->getOpcode()) { default: llvm_unreachable("unexpected opcode for Neon base update"); case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; NumVecs = 1; isLaneOp = false; break; case ISD::STORE: NewOpc = ARMISD::VST1_UPD; NumVecs = 1; isLaneOp = false; isLoadOp = false; break; } } // Find the size of memory referenced by the load/store. EVT VecTy; if (isLoadOp) { VecTy = N->getValueType(0); } else if (isIntrinsic) { VecTy = N->getOperand(AddrOpIdx+1).getValueType(); } else { assert(isStore && "Node has to be a load, a store, or an intrinsic!"); VecTy = N->getOperand(1).getValueType(); } unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; if (isLaneOp) NumBytes /= VecTy.getVectorNumElements(); // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { uint64_t IncVal = CInc->getZExtValue(); if (IncVal != NumBytes) continue; } else if (NumBytes >= 3 * 16) { // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two // separate instructions that make it harder to use a non-constant update. continue; } // OK, we found an ADD we can fold into the base update. // Now, create a _UPD node, taking care of not breaking alignment. EVT AlignedVecTy = VecTy; unsigned Alignment = MemN->getAlignment(); // If this is a less-than-standard-aligned load/store, change the type to // match the standard alignment. // The alignment is overlooked when selecting _UPD variants; and it's // easier to introduce bitcasts here than fix that. // There are 3 ways to get to this base-update combine: // - intrinsics: they are assumed to be properly aligned (to the standard // alignment of the memory type), so we don't need to do anything. // - ARMISD::VLDx nodes: they are only generated from the aforementioned // intrinsics, so, likewise, there's nothing to do. // - generic load/store instructions: the alignment is specified as an // explicit operand, rather than implicitly as the standard alignment // of the memory type (like the intrisics). We need to change the // memory type to match the explicit alignment. That way, we don't // generate non-standard-aligned ARMISD::VLDx nodes. if (isa(N)) { if (Alignment == 0) Alignment = 1; if (Alignment < VecTy.getScalarSizeInBits() / 8) { MVT EltTy = MVT::getIntegerVT(Alignment * 8); assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); assert(!isLaneOp && "Unexpected generic load/store lane."); unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); } // Don't set an explicit alignment on regular load/stores that we want // to transform to VLD/VST 1_UPD nodes. // This matches the behavior of regular load/stores, which only get an // explicit alignment if the MMO alignment is larger than the standard // alignment of the memory type. // Intrinsics, however, always get an explicit alignment, set to the // alignment of the MMO. Alignment = 1; } // Create the new updating load/store node. // First, create an SDVTList for the new updating node's results. EVT Tys[6]; unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); unsigned n; for (n = 0; n < NumResultVecs; ++n) Tys[n] = AlignedVecTy; Tys[n++] = MVT::i32; Tys[n] = MVT::Other; SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); // Then, gather the new node's operands. SmallVector Ops; Ops.push_back(N->getOperand(0)); // incoming chain Ops.push_back(N->getOperand(AddrOpIdx)); Ops.push_back(Inc); if (StoreSDNode *StN = dyn_cast(N)) { // Try to match the intrinsic's signature Ops.push_back(StN->getValue()); } else { // Loads (and of course intrinsics) match the intrinsics' signature, // so just add all but the alignment operand. for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i) Ops.push_back(N->getOperand(i)); } // For all node types, the alignment operand is always the last one. Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); // If this is a non-standard-aligned STORE, the penultimate operand is the // stored value. Bitcast it to the aligned type. if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { SDValue &StVal = Ops[Ops.size()-2]; StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); } SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, AlignedVecTy, MemN->getMemOperand()); // Update the uses. SmallVector NewResults; for (unsigned i = 0; i < NumResultVecs; ++i) NewResults.push_back(SDValue(UpdN.getNode(), i)); // If this is an non-standard-aligned LOAD, the first result is the loaded // value. Bitcast it to the expected result type. if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { SDValue &LdVal = NewResults[0]; LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); } NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain DCI.CombineTo(N, NewResults); DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); break; } return SDValue(); } static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); return CombineBaseUpdate(N, DCI); } /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and /// return true. static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); // vldN-dup instructions only support 64-bit vectors for N > 1. if (!VT.is64BitVector()) return false; // Check if the VDUPLANE operand is a vldN-dup intrinsic. SDNode *VLD = N->getOperand(0).getNode(); if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) return false; unsigned NumVecs = 0; unsigned NewOpc = 0; unsigned IntNo = cast(VLD->getOperand(1))->getZExtValue(); if (IntNo == Intrinsic::arm_neon_vld2lane) { NumVecs = 2; NewOpc = ARMISD::VLD2DUP; } else if (IntNo == Intrinsic::arm_neon_vld3lane) { NumVecs = 3; NewOpc = ARMISD::VLD3DUP; } else if (IntNo == Intrinsic::arm_neon_vld4lane) { NumVecs = 4; NewOpc = ARMISD::VLD4DUP; } else { return false; } // First check that all the vldN-lane uses are VDUPLANEs and that the lane // numbers match the load. unsigned VLDLaneNo = cast(VLD->getOperand(NumVecs+3))->getZExtValue(); for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); UI != UE; ++UI) { // Ignore uses of the chain result. if (UI.getUse().getResNo() == NumVecs) continue; SDNode *User = *UI; if (User->getOpcode() != ARMISD::VDUPLANE || VLDLaneNo != cast(User->getOperand(1))->getZExtValue()) return false; } // Create the vldN-dup node. EVT Tys[5]; unsigned n; for (n = 0; n < NumVecs; ++n) Tys[n] = VT; Tys[n] = MVT::Other; SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; MemIntrinsicSDNode *VLDMemInt = cast(VLD); SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, VLDMemInt->getMemoryVT(), VLDMemInt->getMemOperand()); // Update the uses. for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); UI != UE; ++UI) { unsigned ResNo = UI.getUse().getResNo(); // Ignore uses of the chain result. if (ResNo == NumVecs) continue; SDNode *User = *UI; DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); } // Now the vldN-lane intrinsic is dead except for its chain result. // Update uses of the chain. std::vector VLDDupResults; for (unsigned n = 0; n < NumVecs; ++n) VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); DCI.CombineTo(VLD, VLDDupResults); return true; } /// PerformVDUPLANECombine - Target-specific dag combine xforms for /// ARMISD::VDUPLANE. static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDValue Op = N->getOperand(0); // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. if (CombineVLDDUP(N, DCI)) return SDValue(N, 0); // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is // redundant. Ignore bit_converts for now; element sizes are checked below. while (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) return SDValue(); // Make sure the VMOV element size is not bigger than the VDUPLANE elements. unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits(); // The canonical VMOV for a zero vector uses a 32-bit element size. unsigned Imm = cast(Op.getOperand(0))->getZExtValue(); unsigned EltBits; if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) EltSize = 8; EVT VT = N->getValueType(0); if (EltSize > VT.getVectorElementType().getSizeInBits()) return SDValue(); return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); } static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); // If this is a legal vector load, try to combine it into a VLD1_UPD. if (ISD::isNormalLoad(N) && VT.isVector() && DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) return CombineBaseUpdate(N, DCI); return SDValue(); } /// PerformSTORECombine - Target-specific dag combine xforms for /// ISD::STORE. static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { StoreSDNode *St = cast(N); if (St->isVolatile()) return SDValue(); // Optimize trunc store (of multiple scalars) to shuffle and store. First, // pack all of the elements in one place. Next, store to memory in fewer // chunks. SDValue StVal = St->getValue(); EVT VT = StVal.getValueType(); if (St->isTruncatingStore() && VT.isVector()) { SelectionDAG &DAG = DCI.DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT StVT = St->getMemoryVT(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromEltSz = VT.getVectorElementType().getSizeInBits(); unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits(); // From, To sizes and ElemCount must be pow of two if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); // We are going to use the original vector elt for storing. // Accumulated smaller vector elements must be a multiple of the store size. if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); unsigned SizeRatio = FromEltSz / ToEltSz; assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); SDLoc DL(St); SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1 : i * SizeRatio; // Can't shuffle using an illegal type. if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec.data()); // At this point all of the data is stored at the bottom of the // register. We now need to save it to mem. // Find the largest store unit MVT StoreType = MVT::i8; for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) StoreType = Tp; } // Didn't find a legal store type. if (!TLI.isTypeLegal(StoreType)) return SDValue(); // Bitcast the original vector into a vector of store-size units EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); SmallVector Chains; SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, TLI.getPointerTy(DAG.getDataLayout())); SDValue BasePtr = St->getBasePtr(); // Perform one or more big stores into memory. unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); for (unsigned I = 0; I < E; I++) { SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType, ShuffWide, DAG.getIntPtrConstant(I, DL)); SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment); Chains.push_back(Ch); } return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } if (!ISD::isNormalStore(St)) return SDValue(); // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and // ARM stores of arguments in the same cache line. if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && StVal.getNode()->hasOneUse()) { SelectionDAG &DAG = DCI.DAG; bool isBigEndian = DAG.getDataLayout().isBigEndian(); SDLoc DL(St); SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore(St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ), BasePtr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, DL, MVT::i32)); return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(isBigEndian ? 0 : 1), OffsetPtr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), std::min(4U, St->getAlignment() / 2)); } if (StVal.getValueType() == MVT::i64 && StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { // Bitcast an i64 store extracted from a vector to f64. // Otherwise, the i64 value will be legalized to a pair of i32 values. SelectionDAG &DAG = DCI.DAG; SDLoc dl(StVal); SDValue IntVec = StVal.getOperand(0); EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, IntVec.getValueType().getVectorNumElements()); SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Vec, StVal.getOperand(1)); dl = SDLoc(N); SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); // Make the DAGCombiner fold the bitcasts. DCI.AddToWorklist(Vec.getNode()); DCI.AddToWorklist(ExtElt.getNode()); DCI.AddToWorklist(V.getNode()); return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment(), St->getAAInfo()); } // If this is a legal vector store, try to combine it into a VST1_UPD. if (ISD::isNormalStore(N) && VT.isVector() && DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) return CombineBaseUpdate(N, DCI); return SDValue(); } /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) /// can replace combinations of VMUL and VCVT (floating-point to integer) /// when the VMUL has a constant operand that is a power of 2. /// /// Example (assume d17 = ): /// vmul.f32 d16, d17, d16 /// vcvt.s32.f32 d16, d16 /// becomes: /// vcvt.s32.f32 d16, d16, #3 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue Op = N->getOperand(0); if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) return SDValue(); SDValue ConstVec = Op->getOperand(1); if (!isa(ConstVec)) return SDValue(); MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); uint32_t FloatBits = FloatTy.getSizeInBits(); MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); uint32_t IntBits = IntTy.getSizeInBits(); unsigned NumLanes = Op.getValueType().getVectorNumElements(); if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { // These instructions only exist converting from f32 to i32. We can handle // smaller integers by generating an extra truncate, but larger ones would // be lossy. We also can't handle more then 4 lanes, since these intructions // only support v2i32/v4i32 types. return SDValue(); } BitVector UndefElements; BuildVectorSDNode *BV = cast(ConstVec); int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); if (C == -1 || C == 0 || C > 32) return SDValue(); SDLoc dl(N); bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : Intrinsic::arm_neon_vcvtfp2fxu; SDValue FixConv = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0), DAG.getConstant(C, dl, MVT::i32)); if (IntBits < FloatBits) FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv); return FixConv; } /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) /// can replace combinations of VCVT (integer to floating-point) and VDIV /// when the VDIV has a constant operand that is a power of 2. /// /// Example (assume d17 = ): /// vcvt.f32.s32 d16, d16 /// vdiv.f32 d16, d17, d16 /// becomes: /// vcvt.f32.s32 d16, d16, #3 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue Op = N->getOperand(0); unsigned OpOpcode = Op.getNode()->getOpcode(); if (!N->getValueType(0).isVector() || (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) return SDValue(); SDValue ConstVec = N->getOperand(1); if (!isa(ConstVec)) return SDValue(); MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); uint32_t FloatBits = FloatTy.getSizeInBits(); MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); uint32_t IntBits = IntTy.getSizeInBits(); unsigned NumLanes = Op.getValueType().getVectorNumElements(); if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { // These instructions only exist converting from i32 to f32. We can handle // smaller integers by generating an extra extend, but larger ones would // be lossy. We also can't handle more then 4 lanes, since these intructions // only support v2i32/v4i32 types. return SDValue(); } BitVector UndefElements; BuildVectorSDNode *BV = cast(ConstVec); int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); if (C == -1 || C == 0 || C > 32) return SDValue(); SDLoc dl(N); bool isSigned = OpOpcode == ISD::SINT_TO_FP; SDValue ConvInput = Op.getOperand(0); if (IntBits < FloatBits) ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput); unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp : Intrinsic::arm_neon_vcvtfxu2fp; return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), ConvInput, DAG.getConstant(C, dl, MVT::i32)); } /// Getvshiftimm - Check if this is a valid build_vector for the immediate /// operand of a vector shift operation, where all the elements of the /// build_vector must have the same constant integer value. static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { // Ignore bit_converts. while (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, ElementBits) || SplatBitSize > ElementBits) return false; Cnt = SplatBits.getSExtValue(); return true; } /// isVShiftLImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift left operation. That value must be in the range: /// 0 <= Value < ElementBits for a left shift; or /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); } /// isVShiftRImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift right operation. For a shift opcode, the value /// is positive, but for an intrinsic the value count must be negative. The /// absolute value must be in the range: /// 1 <= |Value| <= ElementBits for a right shift; or /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) return false; if (!isIntrinsic) return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) { Cnt = -Cnt; return true; } return false; } /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); switch (IntNo) { default: // Don't do anything for most intrinsics. break; // Vector shifts: check for immediate versions and lower them. // Note: This is done during DAG combining instead of DAG legalizing because // the build_vectors for 64-bit vector element shift counts are generally // not legal, and it is hard to see their values after they get legalized to // loads from a constant pool. case Intrinsic::arm_neon_vshifts: case Intrinsic::arm_neon_vshiftu: case Intrinsic::arm_neon_vrshifts: case Intrinsic::arm_neon_vrshiftu: case Intrinsic::arm_neon_vrshiftn: case Intrinsic::arm_neon_vqshifts: case Intrinsic::arm_neon_vqshiftu: case Intrinsic::arm_neon_vqshiftsu: case Intrinsic::arm_neon_vqshiftns: case Intrinsic::arm_neon_vqshiftnu: case Intrinsic::arm_neon_vqshiftnsu: case Intrinsic::arm_neon_vqrshiftns: case Intrinsic::arm_neon_vqrshiftnu: case Intrinsic::arm_neon_vqrshiftnsu: { EVT VT = N->getOperand(1).getValueType(); int64_t Cnt; unsigned VShiftOpc = 0; switch (IntNo) { case Intrinsic::arm_neon_vshifts: case Intrinsic::arm_neon_vshiftu: if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { VShiftOpc = ARMISD::VSHL; break; } if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRs : ARMISD::VSHRu); break; } return SDValue(); case Intrinsic::arm_neon_vrshifts: case Intrinsic::arm_neon_vrshiftu: if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) break; return SDValue(); case Intrinsic::arm_neon_vqshifts: case Intrinsic::arm_neon_vqshiftu: if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) break; return SDValue(); case Intrinsic::arm_neon_vqshiftsu: if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) break; llvm_unreachable("invalid shift count for vqshlu intrinsic"); case Intrinsic::arm_neon_vrshiftn: case Intrinsic::arm_neon_vqshiftns: case Intrinsic::arm_neon_vqshiftnu: case Intrinsic::arm_neon_vqshiftnsu: case Intrinsic::arm_neon_vqrshiftns: case Intrinsic::arm_neon_vqrshiftnu: case Intrinsic::arm_neon_vqrshiftnsu: // Narrowing shifts require an immediate right shift. if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) break; llvm_unreachable("invalid shift count for narrowing vector shift " "intrinsic"); default: llvm_unreachable("unhandled vector shift"); } switch (IntNo) { case Intrinsic::arm_neon_vshifts: case Intrinsic::arm_neon_vshiftu: // Opcode already set above. break; case Intrinsic::arm_neon_vrshifts: VShiftOpc = ARMISD::VRSHRs; break; case Intrinsic::arm_neon_vrshiftu: VShiftOpc = ARMISD::VRSHRu; break; case Intrinsic::arm_neon_vrshiftn: VShiftOpc = ARMISD::VRSHRN; break; case Intrinsic::arm_neon_vqshifts: VShiftOpc = ARMISD::VQSHLs; break; case Intrinsic::arm_neon_vqshiftu: VShiftOpc = ARMISD::VQSHLu; break; case Intrinsic::arm_neon_vqshiftsu: VShiftOpc = ARMISD::VQSHLsu; break; case Intrinsic::arm_neon_vqshiftns: VShiftOpc = ARMISD::VQSHRNs; break; case Intrinsic::arm_neon_vqshiftnu: VShiftOpc = ARMISD::VQSHRNu; break; case Intrinsic::arm_neon_vqshiftnsu: VShiftOpc = ARMISD::VQSHRNsu; break; case Intrinsic::arm_neon_vqrshiftns: VShiftOpc = ARMISD::VQRSHRNs; break; case Intrinsic::arm_neon_vqrshiftnu: VShiftOpc = ARMISD::VQRSHRNu; break; case Intrinsic::arm_neon_vqrshiftnsu: VShiftOpc = ARMISD::VQRSHRNsu; break; } SDLoc dl(N); return DAG.getNode(VShiftOpc, dl, N->getValueType(0), N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); } case Intrinsic::arm_neon_vshiftins: { EVT VT = N->getOperand(1).getValueType(); int64_t Cnt; unsigned VShiftOpc = 0; if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) VShiftOpc = ARMISD::VSLI; else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) VShiftOpc = ARMISD::VSRI; else { llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); } SDLoc dl(N); return DAG.getNode(VShiftOpc, dl, N->getValueType(0), N->getOperand(1), N->getOperand(2), DAG.getConstant(Cnt, dl, MVT::i32)); } case Intrinsic::arm_neon_vqrshifts: case Intrinsic::arm_neon_vqrshiftu: // No immediate versions of these to check for. break; } return SDValue(); } /// PerformShiftCombine - Checks for immediate versions of vector shifts and /// lowers them. As with the vector shift intrinsics, this is done during DAG /// combining instead of DAG legalizing because the build_vectors for 64-bit /// vector element shift counts are generally not legal, and it is hard to see /// their values after they get legalized to loads from a constant pool. static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. SDValue N1 = N->getOperand(1); if (ConstantSDNode *C = dyn_cast(N1)) { SDValue N0 = N->getOperand(0); if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && DAG.MaskedValueIsZero(N0.getOperand(0), APInt::getHighBitsSet(32, 16))) return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); } } // Nothing to be done for scalar shifts. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!VT.isVector() || !TLI.isTypeLegal(VT)) return SDValue(); assert(ST->hasNEON() && "unexpected vector shift"); int64_t Cnt; switch (N->getOpcode()) { default: llvm_unreachable("unexpected shift opcode"); case ISD::SHL: if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { SDLoc dl(N); return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0), DAG.getConstant(Cnt, dl, MVT::i32)); } break; case ISD::SRA: case ISD::SRL: if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? ARMISD::VSHRs : ARMISD::VSHRu); SDLoc dl(N); return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), DAG.getConstant(Cnt, dl, MVT::i32)); } } return SDValue(); } /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { SDValue N0 = N->getOperand(0); // Check for sign- and zero-extensions of vector extract operations of 8- // and 16-bit vector elements. NEON supports these directly. They are // handled during DAG combining because type legalization will promote them // to 32-bit types and it is messy to recognize the operations after that. if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue Vec = N0.getOperand(0); SDValue Lane = N0.getOperand(1); EVT VT = N->getValueType(0); EVT EltVT = N0.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (VT == MVT::i32 && (EltVT == MVT::i8 || EltVT == MVT::i16) && TLI.isTypeLegal(Vec.getValueType()) && isa(Lane)) { unsigned Opc = 0; switch (N->getOpcode()) { default: llvm_unreachable("unexpected opcode"); case ISD::SIGN_EXTEND: Opc = ARMISD::VGETLANEs; break; case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: Opc = ARMISD::VGETLANEu; break; } return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); } } return SDValue(); } static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero, APInt &KnownOne) { if (Op.getOpcode() == ARMISD::BFI) { // Conservatively, we can recurse down the first operand // and just mask out all affected bits. computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne); // The operand to BFI is already a mask suitable for removing the bits it // sets. ConstantSDNode *CI = cast(Op.getOperand(2)); APInt Mask = CI->getAPIntValue(); KnownZero &= Mask; KnownOne &= Mask; return; } if (Op.getOpcode() == ARMISD::CMOV) { APInt KZ2(KnownZero.getBitWidth(), 0); APInt KO2(KnownOne.getBitWidth(), 0); computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne); computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2); KnownZero &= KZ2; KnownOne &= KO2; return; } return DAG.computeKnownBits(Op, KnownZero, KnownOne); } SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { // If we have a CMOV, OR and AND combination such as: // if (x & CN) // y |= CM; // // And: // * CN is a single bit; // * All bits covered by CM are known zero in y // // Then we can convert this into a sequence of BFI instructions. This will // always be a win if CM is a single bit, will always be no worse than the // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is // three bits (due to the extra IT instruction). SDValue Op0 = CMOV->getOperand(0); SDValue Op1 = CMOV->getOperand(1); auto CCNode = cast(CMOV->getOperand(2)); auto CC = CCNode->getAPIntValue().getLimitedValue(); SDValue CmpZ = CMOV->getOperand(4); // The compare must be against zero. if (!isNullConstant(CmpZ->getOperand(1))) return SDValue(); assert(CmpZ->getOpcode() == ARMISD::CMPZ); SDValue And = CmpZ->getOperand(0); if (And->getOpcode() != ISD::AND) return SDValue(); ConstantSDNode *AndC = dyn_cast(And->getOperand(1)); if (!AndC || !AndC->getAPIntValue().isPowerOf2()) return SDValue(); SDValue X = And->getOperand(0); if (CC == ARMCC::EQ) { // We're performing an "equal to zero" compare. Swap the operands so we // canonicalize on a "not equal to zero" compare. std::swap(Op0, Op1); } else { assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); } if (Op1->getOpcode() != ISD::OR) return SDValue(); ConstantSDNode *OrC = dyn_cast(Op1->getOperand(1)); if (!OrC) return SDValue(); SDValue Y = Op1->getOperand(0); if (Op0 != Y) return SDValue(); // Now, is it profitable to continue? APInt OrCI = OrC->getAPIntValue(); unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; if (OrCI.countPopulation() > Heuristic) return SDValue(); // Lastly, can we determine that the bits defined by OrCI // are zero in Y? APInt KnownZero, KnownOne; computeKnownBits(DAG, Y, KnownZero, KnownOne); if ((OrCI & KnownZero) != OrCI) return SDValue(); // OK, we can do the combine. SDValue V = Y; SDLoc dl(X); EVT VT = X.getValueType(); unsigned BitInX = AndC->getAPIntValue().logBase2(); if (BitInX != 0) { // We must shift X first. X = DAG.getNode(ISD::SRL, dl, VT, X, DAG.getConstant(BitInX, dl, VT)); } for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); BitInY < NumActiveBits; ++BitInY) { if (OrCI[BitInY] == 0) continue; APInt Mask(VT.getSizeInBits(), 0); Mask.setBit(BitInY); V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, // Confusingly, the operand is an *inverted* mask. DAG.getConstant(~Mask, dl, VT)); } return V; } /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. SDValue ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { SDValue Cmp = N->getOperand(4); if (Cmp.getOpcode() != ARMISD::CMPZ) // Only looking at EQ and NE cases. return SDValue(); EVT VT = N->getValueType(0); SDLoc dl(N); SDValue LHS = Cmp.getOperand(0); SDValue RHS = Cmp.getOperand(1); SDValue FalseVal = N->getOperand(0); SDValue TrueVal = N->getOperand(1); SDValue ARMcc = N->getOperand(2); ARMCC::CondCodes CC = (ARMCC::CondCodes)cast(ARMcc)->getZExtValue(); // BFI is only available on V6T2+. if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { SDValue R = PerformCMOVToBFICombine(N, DAG); if (R) return R; } // Simplify // mov r1, r0 // cmp r1, x // mov r0, y // moveq r0, x // to // cmp r0, x // movne r0, y // // mov r1, r0 // cmp r1, x // mov r0, x // movne r0, y // to // cmp r0, x // movne r0, y /// FIXME: Turn this into a target neutral optimization? SDValue Res; if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, N->getOperand(3), Cmp); } else if (CC == ARMCC::EQ && TrueVal == RHS) { SDValue ARMcc; SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, N->getOperand(3), NewCmp); } if (Res.getNode()) { APInt KnownZero, KnownOne; DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne); // Capture demanded bits information that would be otherwise lost. if (KnownZero == 0xfffffffe) Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, DAG.getValueType(MVT::i1)); else if (KnownZero == 0xffffff00) Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, DAG.getValueType(MVT::i8)); else if (KnownZero == 0xffff0000) Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, DAG.getValueType(MVT::i16)); } return Res; } SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; case ISD::ADDC: return PerformADDCCombine(N, DCI, Subtarget); case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); case ISD::SUB: return PerformSUBCombine(N, DCI); case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); case ARMISD::BFI: return PerformBFICombine(N, DCI); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); case ISD::STORE: return PerformSTORECombine(N, DCI); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI.DAG, Subtarget); case ISD::FDIV: return PerformVDIVCombine(N, DCI.DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: case ISD::SRA: case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget); case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); case ISD::LOAD: return PerformLOADCombine(N, DCI); case ARMISD::VLD2DUP: case ARMISD::VLD3DUP: case ARMISD::VLD4DUP: return PerformVLDCombine(N, DCI); case ARMISD::BUILD_VECTOR: return PerformARMBUILD_VECTORCombine(N, DCI); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { case Intrinsic::arm_neon_vld1: case Intrinsic::arm_neon_vld2: case Intrinsic::arm_neon_vld3: case Intrinsic::arm_neon_vld4: case Intrinsic::arm_neon_vld2lane: case Intrinsic::arm_neon_vld3lane: case Intrinsic::arm_neon_vld4lane: case Intrinsic::arm_neon_vst1: case Intrinsic::arm_neon_vst2: case Intrinsic::arm_neon_vst3: case Intrinsic::arm_neon_vst4: case Intrinsic::arm_neon_vst2lane: case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: return PerformVLDCombine(N, DCI); default: break; } break; } return SDValue(); } bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const { return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); } bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i8: case MVT::i16: case MVT::i32: { // Unaligned access can use (for example) LRDB, LRDH, LDR if (AllowsUnaligned) { if (Fast) *Fast = Subtarget->hasV7Ops(); return true; } return false; } case MVT::f64: case MVT::v2f64: { // For any little-endian targets with neon, we can support unaligned ld/st // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. // A big-endian target may also explicitly support unaligned accesses if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { if (Fast) *Fast = true; return true; } return false; } } } static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, unsigned AlignCheck) { return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && (DstAlign == 0 || DstAlign % AlignCheck == 0)); } EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { const Function *F = MF.getFunction(); // See if we can use NEON instructions for this... if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() && !F->hasFnAttribute(Attribute::NoImplicitFloat)) { bool Fast; if (Size >= 16 && (memOpAlign(SrcAlign, DstAlign, 16) || (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) { return MVT::v2f64; } else if (Size >= 8 && (memOpAlign(SrcAlign, DstAlign, 8) || (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) && Fast))) { return MVT::f64; } } // Lowering to i32/i16 if the size permits. if (Size >= 4) return MVT::i32; else if (Size >= 2) return MVT::i16; // Let the target-independent logic figure it out. return MVT::Other; } bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { if (Val.getOpcode() != ISD::LOAD) return false; EVT VT1 = Val.getValueType(); if (!VT1.isSimple() || !VT1.isInteger() || !VT2.isSimple() || !VT2.isInteger()) return false; switch (VT1.getSimpleVT().SimpleTy) { default: break; case MVT::i1: case MVT::i8: case MVT::i16: // 8-bit and 16-bit loads implicitly zero-extend to 32-bits. return true; } return false; } bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { EVT VT = ExtVal.getValueType(); if (!isTypeLegal(VT)) return false; // Don't create a loadext if we can fold the extension into a wide/long // instruction. // If there's more than one user instruction, the loadext is desirable no // matter what. There can be two uses by the same instruction. if (ExtVal->use_empty() || !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) return true; SDNode *U = *ExtVal->use_begin(); if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL)) return false; return true; } bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; if (!isTypeLegal(EVT::getEVT(Ty1))) return false; assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); // Assuming the caller doesn't have a zeroext or signext return parameter, // truncation all the way down to i1 is valid. return true; } static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { if (V < 0) return false; unsigned Scale = 1; switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: case MVT::i8: // Scale == 1; break; case MVT::i16: // Scale == 2; Scale = 2; break; case MVT::i32: // Scale == 4; Scale = 4; break; } if ((V & (Scale - 1)) != 0) return false; V /= Scale; return V == (V & ((1LL << 5) - 1)); } static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget) { bool isNeg = false; if (V < 0) { isNeg = true; V = - V; } switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: case MVT::i8: case MVT::i16: case MVT::i32: // + imm12 or - imm8 if (isNeg) return V == (V & ((1LL << 8) - 1)); return V == (V & ((1LL << 12) - 1)); case MVT::f32: case MVT::f64: // Same as ARM mode. FIXME: NEON? if (!Subtarget->hasVFP2()) return false; if ((V & 3) != 0) return false; V >>= 2; return V == (V & ((1LL << 8) - 1)); } } /// isLegalAddressImmediate - Return true if the integer value can be used /// as the offset of the target addressing mode for load / store of the /// given type. static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget) { if (V == 0) return true; if (!VT.isSimple()) return false; if (Subtarget->isThumb1Only()) return isLegalT1AddressImmediate(V, VT); else if (Subtarget->isThumb2()) return isLegalT2AddressImmediate(V, VT, Subtarget); // ARM mode. if (V < 0) V = - V; switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: case MVT::i8: case MVT::i32: // +- imm12 return V == (V & ((1LL << 12) - 1)); case MVT::i16: // +- imm8 return V == (V & ((1LL << 8) - 1)); case MVT::f32: case MVT::f64: if (!Subtarget->hasVFP2()) // FIXME: NEON? return false; if ((V & 3) != 0) return false; V >>= 2; return V == (V & ((1LL << 8) - 1)); } } bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const { int Scale = AM.Scale; if (Scale < 0) return false; switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: case MVT::i8: case MVT::i16: case MVT::i32: if (Scale == 1) return true; // r + r << imm Scale = Scale & ~1; return Scale == 2 || Scale == 4 || Scale == 8; case MVT::i64: // r + r if (((unsigned)AM.HasBaseReg + Scale) <= 2) return true; return false; case MVT::isVoid: // Note, we allow "void" uses (basically, uses that aren't loads or // stores), because arm allows folding a scale into many arithmetic // operations. This should be made more precise and revisited later. // Allow r << imm, but the imm has to be a multiple of two. if (Scale & 1) return false; return isPowerOf2_32(Scale); } } /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { EVT VT = getValueType(DL, Ty, true); if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) return false; // Can never fold addr of global into load/store. if (AM.BaseGV) return false; switch (AM.Scale) { case 0: // no scale reg, must be "r+i" or "r", or "i". break; case 1: if (Subtarget->isThumb1Only()) return false; // FALL THROUGH. default: // ARM doesn't support any R+R*scale+imm addr modes. if (AM.BaseOffs) return false; if (!VT.isSimple()) return false; if (Subtarget->isThumb2()) return isLegalT2ScaledAddressingMode(AM, VT); int Scale = AM.Scale; switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: case MVT::i8: case MVT::i32: if (Scale < 0) Scale = -Scale; if (Scale == 1) return true; // r + r << imm return isPowerOf2_32(Scale & ~1); case MVT::i16: case MVT::i64: // r + r if (((unsigned)AM.HasBaseReg + Scale) <= 2) return true; return false; case MVT::isVoid: // Note, we allow "void" uses (basically, uses that aren't loads or // stores), because arm allows folding a scale into many arithmetic // operations. This should be made more precise and revisited later. // Allow r << imm, but the imm has to be a multiple of two. if (Scale & 1) return false; return isPowerOf2_32(Scale); } } return true; } /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can compare /// a register against the immediate without having to materialize the /// immediate into a register. bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { // Thumb2 and ARM modes can use cmn for negative immediates. if (!Subtarget->isThumb()) return ARM_AM::getSOImmVal(std::abs(Imm)) != -1; if (Subtarget->isThumb2()) return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1; // Thumb1 doesn't have cmn, and only 8-bit immediates. return Imm >= 0 && Imm <= 255; } /// isLegalAddImmediate - Return true if the specified immediate is a legal add /// *or sub* immediate, that is the target has add or sub instructions which can /// add a register with the immediate without having to materialize the /// immediate into a register. bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { // Same encoding for add/sub, just flip the sign. int64_t AbsImm = std::abs(Imm); if (!Subtarget->isThumb()) return ARM_AM::getSOImmVal(AbsImm) != -1; if (Subtarget->isThumb2()) return ARM_AM::getT2SOImmVal(AbsImm) != -1; // Thumb1 only has 8-bit unsigned immediate. return AbsImm >= 0 && AbsImm <= 255; } static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG) { if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) return false; if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { // AddressingMode 3 Base = Ptr->getOperand(0); if (ConstantSDNode *RHS = dyn_cast(Ptr->getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); if (RHSC < 0 && RHSC > -256) { assert(Ptr->getOpcode() == ISD::ADD); isInc = false; Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); return true; } } isInc = (Ptr->getOpcode() == ISD::ADD); Offset = Ptr->getOperand(1); return true; } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { // AddressingMode 2 if (ConstantSDNode *RHS = dyn_cast(Ptr->getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); if (RHSC < 0 && RHSC > -0x1000) { assert(Ptr->getOpcode() == ISD::ADD); isInc = false; Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); Base = Ptr->getOperand(0); return true; } } if (Ptr->getOpcode() == ISD::ADD) { isInc = true; ARM_AM::ShiftOpc ShOpcVal= ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); if (ShOpcVal != ARM_AM::no_shift) { Base = Ptr->getOperand(1); Offset = Ptr->getOperand(0); } else { Base = Ptr->getOperand(0); Offset = Ptr->getOperand(1); } return true; } isInc = (Ptr->getOpcode() == ISD::ADD); Base = Ptr->getOperand(0); Offset = Ptr->getOperand(1); return true; } // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. return false; } static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG) { if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) return false; Base = Ptr->getOperand(0); if (ConstantSDNode *RHS = dyn_cast(Ptr->getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); if (RHSC < 0 && RHSC > -0x100) { // 8 bits. assert(Ptr->getOpcode() == ISD::ADD); isInc = false; Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); return true; } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. isInc = Ptr->getOpcode() == ISD::ADD; Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); return true; } } return false; } /// getPreIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if the node's address /// can be legally represented as pre-indexed load / store address. bool ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { if (Subtarget->isThumb1Only()) return false; EVT VT; SDValue Ptr; bool isSEXTLoad = false; if (LoadSDNode *LD = dyn_cast(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; } else if (StoreSDNode *ST = dyn_cast(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); } else return false; bool isInc; bool isLegal = false; if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, Offset, isInc, DAG); else isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, Offset, isInc, DAG); if (!isLegal) return false; AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC; return true; } /// getPostIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if this node can be /// combined with a load / store to form a post-indexed load / store. bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { if (Subtarget->isThumb1Only()) return false; EVT VT; SDValue Ptr; bool isSEXTLoad = false; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; } else if (StoreSDNode *ST = dyn_cast(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); } else return false; bool isInc; bool isLegal = false; if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, isInc, DAG); else isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, isInc, DAG); if (!isLegal) return false; if (Ptr != Base) { // Swap base ptr and offset to catch more post-index load / store when // it's legal. In Thumb2 mode, offset must be an immediate. if (Ptr == Offset && Op->getOpcode() == ISD::ADD && !Subtarget->isThumb2()) std::swap(Base, Offset); // Post-indexed load / store update the base pointer. if (Ptr != Base) return false; } AM = isInc ? ISD::POST_INC : ISD::POST_DEC; return true; } void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { unsigned BitWidth = KnownOne.getBitWidth(); KnownZero = KnownOne = APInt(BitWidth, 0); switch (Op.getOpcode()) { default: break; case ARMISD::ADDC: case ARMISD::ADDE: case ARMISD::SUBC: case ARMISD::SUBE: // These nodes' second result is a boolean if (Op.getResNo() == 0) break; KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ARMISD::CMOV: { // Bits are known zero/one if known on the LHS and RHS. DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); if (KnownZero == 0 && KnownOne == 0) return; APInt KnownZeroRHS, KnownOneRHS; DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1); KnownZero &= KnownZeroRHS; KnownOne &= KnownOneRHS; return; } case ISD::INTRINSIC_W_CHAIN: { ConstantSDNode *CN = cast(Op->getOperand(1)); Intrinsic::ID IntID = static_cast(CN->getZExtValue()); switch (IntID) { default: return; case Intrinsic::arm_ldaex: case Intrinsic::arm_ldrex: { EVT VT = cast(Op)->getMemoryVT(); unsigned MemBits = VT.getScalarType().getSizeInBits(); KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); return; } } } } } //===----------------------------------------------------------------------===// // ARM Inline Assembly Support //===----------------------------------------------------------------------===// bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { // Looking for "rev" which is V6+. if (!Subtarget->hasV6Ops()) return false; InlineAsm *IA = cast(CI->getCalledValue()); std::string AsmStr = IA->getAsmString(); SmallVector AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); switch (AsmPieces.size()) { default: return false; case 1: AsmStr = AsmPieces[0]; AsmPieces.clear(); SplitString(AsmStr, AsmPieces, " \t,"); // rev $0, $1 if (AsmPieces.size() == 3 && AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && IA->getConstraintString().compare(0, 4, "=l,l") == 0) { IntegerType *Ty = dyn_cast(CI->getType()); if (Ty && Ty->getBitWidth() == 32) return IntrinsicLowering::LowerToByteSwap(CI); } break; } return false; } /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. ARMTargetLowering::ConstraintType ARMTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; case 'l': return C_RegisterClass; case 'w': return C_RegisterClass; case 'h': return C_RegisterClass; case 'x': return C_RegisterClass; case 't': return C_RegisterClass; case 'j': return C_Other; // Constant for movw. // An address with a single base register. Due to the way we // currently handle addresses it is the same as an 'r' memory constraint. case 'Q': return C_Memory; } } else if (Constraint.size() == 2) { switch (Constraint[0]) { default: break; // All 'U+' constraints are addresses. case 'U': return C_Memory; } } return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight ARMTargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); break; case 'l': if (type->isIntegerTy()) { if (Subtarget->isThumb()) weight = CW_SpecificReg; else weight = CW_Register; } break; case 'w': if (type->isFloatingPointTy()) weight = CW_Register; break; } return weight; } typedef std::pair RCPair; RCPair ARMTargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { // GCC ARM Constraint Letters switch (Constraint[0]) { case 'l': // Low regs or general regs. if (Subtarget->isThumb()) return RCPair(0U, &ARM::tGPRRegClass); return RCPair(0U, &ARM::GPRRegClass); case 'h': // High regs or no regs. if (Subtarget->isThumb()) return RCPair(0U, &ARM::hGPRRegClass); break; case 'r': if (Subtarget->isThumb1Only()) return RCPair(0U, &ARM::tGPRRegClass); return RCPair(0U, &ARM::GPRRegClass); case 'w': if (VT == MVT::Other) break; if (VT == MVT::f32) return RCPair(0U, &ARM::SPRRegClass); if (VT.getSizeInBits() == 64) return RCPair(0U, &ARM::DPRRegClass); if (VT.getSizeInBits() == 128) return RCPair(0U, &ARM::QPRRegClass); break; case 'x': if (VT == MVT::Other) break; if (VT == MVT::f32) return RCPair(0U, &ARM::SPR_8RegClass); if (VT.getSizeInBits() == 64) return RCPair(0U, &ARM::DPR_8RegClass); if (VT.getSizeInBits() == 128) return RCPair(0U, &ARM::QPR_8RegClass); break; case 't': if (VT == MVT::f32) return RCPair(0U, &ARM::SPRRegClass); break; } } if (StringRef("{cc}").equals_lower(Constraint)) return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector&Ops, SelectionDAG &DAG) const { SDValue Result; // Currently only support length 1 constraints. if (Constraint.length() != 1) return; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { default: break; case 'j': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': ConstantSDNode *C = dyn_cast(Op); if (!C) return; int64_t CVal64 = C->getSExtValue(); int CVal = (int) CVal64; // None of these constraints allow values larger than 32 bits. Check // that the value fits in an int. if (CVal != CVal64) return; switch (ConstraintLetter) { case 'j': // Constant suitable for movw, must be between 0 and // 65535. if (Subtarget->hasV6T2Ops()) if (CVal >= 0 && CVal <= 65535) break; return; case 'I': if (Subtarget->isThumb1Only()) { // This must be a constant between 0 and 255, for ADD // immediates. if (CVal >= 0 && CVal <= 255) break; } else if (Subtarget->isThumb2()) { // A constant that can be used as an immediate value in a // data-processing instruction. if (ARM_AM::getT2SOImmVal(CVal) != -1) break; } else { // A constant that can be used as an immediate value in a // data-processing instruction. if (ARM_AM::getSOImmVal(CVal) != -1) break; } return; case 'J': if (Subtarget->isThumb1Only()) { // This must be a constant between -255 and -1, for negated ADD // immediates. This can be used in GCC with an "n" modifier that // prints the negated value, for use with SUB instructions. It is // not useful otherwise but is implemented for compatibility. if (CVal >= -255 && CVal <= -1) break; } else { // This must be a constant between -4095 and 4095. It is not clear // what this constraint is intended for. Implemented for // compatibility with GCC. if (CVal >= -4095 && CVal <= 4095) break; } return; case 'K': if (Subtarget->isThumb1Only()) { // A 32-bit value where only one byte has a nonzero value. Exclude // zero to match GCC. This constraint is used by GCC internally for // constants that can be loaded with a move/shift combination. // It is not useful otherwise but is implemented for compatibility. if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) break; } else if (Subtarget->isThumb2()) { // A constant whose bitwise inverse can be used as an immediate // value in a data-processing instruction. This can be used in GCC // with a "B" modifier that prints the inverted value, for use with // BIC and MVN instructions. It is not useful otherwise but is // implemented for compatibility. if (ARM_AM::getT2SOImmVal(~CVal) != -1) break; } else { // A constant whose bitwise inverse can be used as an immediate // value in a data-processing instruction. This can be used in GCC // with a "B" modifier that prints the inverted value, for use with // BIC and MVN instructions. It is not useful otherwise but is // implemented for compatibility. if (ARM_AM::getSOImmVal(~CVal) != -1) break; } return; case 'L': if (Subtarget->isThumb1Only()) { // This must be a constant between -7 and 7, // for 3-operand ADD/SUB immediate instructions. if (CVal >= -7 && CVal < 7) break; } else if (Subtarget->isThumb2()) { // A constant whose negation can be used as an immediate value in a // data-processing instruction. This can be used in GCC with an "n" // modifier that prints the negated value, for use with SUB // instructions. It is not useful otherwise but is implemented for // compatibility. if (ARM_AM::getT2SOImmVal(-CVal) != -1) break; } else { // A constant whose negation can be used as an immediate value in a // data-processing instruction. This can be used in GCC with an "n" // modifier that prints the negated value, for use with SUB // instructions. It is not useful otherwise but is implemented for // compatibility. if (ARM_AM::getSOImmVal(-CVal) != -1) break; } return; case 'M': if (Subtarget->isThumb1Only()) { // This must be a multiple of 4 between 0 and 1020, for // ADD sp + immediate. if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) break; } else { // A power of two or a constant between 0 and 32. This is used in // GCC for the shift amount on shifted register operands, but it is // useful in general for any shift amounts. if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) break; } return; case 'N': if (Subtarget->isThumb()) { // FIXME thumb2 // This must be a constant between 0 and 31, for shift amounts. if (CVal >= 0 && CVal <= 31) break; } return; case 'O': if (Subtarget->isThumb()) { // FIXME thumb2 // This must be a multiple of 4 between -508 and 508, for // ADD/SUB sp = sp + immediate. if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) break; } return; } Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); break; } if (Result.getNode()) { Ops.push_back(Result); return; } return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } static RTLIB::Libcall getDivRemLibcall( const SDNode *N, MVT::SimpleValueType SVT) { assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && "Unhandled Opcode in getDivRemLibcall"); bool isSigned = N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::SREM; RTLIB::Libcall LC; switch (SVT) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; } return LC; } static TargetLowering::ArgListTy getDivRemArgList( const SDNode *N, LLVMContext *Context) { assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && "Unhandled Opcode in getDivRemArgList"); bool isSigned = N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::SREM; TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { EVT ArgVT = N->getOperand(i).getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*Context); Entry.Node = N->getOperand(i); Entry.Ty = ArgTy; Entry.isSExt = isSigned; Entry.isZExt = !isSigned; Args.push_back(Entry); } return Args; } SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) && "Register-based DivRem lowering only"); unsigned Opcode = Op->getOpcode(); assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && "Invalid opcode for Div/Rem lowering"); bool isSigned = (Opcode == ISD::SDIVREM); EVT VT = Op->getValueType(0); Type *Ty = VT.getTypeForEVT(*DAG.getContext()); RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), VT.getSimpleVT().SimpleTy); SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), DAG.getContext()); SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr); SDLoc dl(Op); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(InChain) .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0) .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); std::pair CallInfo = LowerCallTo(CLI); return CallInfo.first; } // Lowers REM using divmod helpers // see RTABI section 4.2/4.3 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { // Build return types (div and rem) std::vector RetTyParams; Type *RetTyElement; switch (N->getValueType(0).getSimpleVT().SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; } RetTyParams.push_back(RetTyElement); RetTyParams.push_back(RetTyElement); ArrayRef ret = ArrayRef(RetTyParams); Type *RetTy = StructType::get(*DAG.getContext(), ret); RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). SimpleTy); SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext()); bool isSigned = N->getOpcode() == ISD::SREM; SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); // Lower call CallLoweringInfo CLI(DAG); CLI.setChain(InChain) .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args), 0) .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); std::pair CallResult = LowerCallTo(CLI); // Return second (rem) result operand (first contains div) SDNode *ResNode = CallResult.first.getNode(); assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); return ResNode->getOperand(1); } SDValue ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "unsupported target platform"); SDLoc DL(Op); // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, DAG.getConstant(2, DL, MVT::i32)); SDValue Flag; Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); Flag = Chain.getValue(1); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); Chain = NewSP.getValue(1); SDValue Ops[2] = { NewSP, Chain }; return DAG.getMergeValues(Ops, DL); } SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() && "Unexpected type for custom-lowering FP_EXTEND"); RTLIB::Libcall LC; LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); SDValue SrcVal = Op.getOperand(0); return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, SDLoc(Op)).first; } SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOperand(0).getValueType() == MVT::f64 && Subtarget->isFPOnlySP() && "Unexpected type for custom-lowering FP_ROUND"); RTLIB::Libcall LC; LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); SDValue SrcVal = Op.getOperand(0); return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, SDLoc(Op)).first; } bool ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The ARM target isn't yet aware of offsets. return false; } bool ARM::isBitFieldInvertedMask(unsigned v) { if (v == 0xffffffff) return false; // there can be 1's on either or both "outsides", all the "inside" // bits must be 0's return isShiftedMask_32(~v); } /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { if (!Subtarget->hasVFP3()) return false; if (VT == MVT::f32) return ARM_AM::getFP32Imm(Imm) != -1; if (VT == MVT::f64 && !Subtarget->isFPOnlySP()) return ARM_AM::getFP64Imm(Imm) != -1; return false; } /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const { switch (Intrinsic) { case Intrinsic::arm_neon_vld1: case Intrinsic::arm_neon_vld2: case Intrinsic::arm_neon_vld3: case Intrinsic::arm_neon_vld4: case Intrinsic::arm_neon_vld2lane: case Intrinsic::arm_neon_vld3lane: case Intrinsic::arm_neon_vld4lane: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); Info.align = cast(AlignArg)->getZExtValue(); Info.vol = false; // volatile loads with NEON intrinsics not supported Info.readMem = true; Info.writeMem = false; return true; } case Intrinsic::arm_neon_vst1: case Intrinsic::arm_neon_vst2: case Intrinsic::arm_neon_vst3: case Intrinsic::arm_neon_vst4: case Intrinsic::arm_neon_vst2lane: case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: { Info.opc = ISD::INTRINSIC_VOID; // Conservatively set memVT to the entire set of vectors stored. auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); unsigned NumElts = 0; for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); Info.align = cast(AlignArg)->getZExtValue(); Info.vol = false; // volatile stores with NEON intrinsics not supported Info.readMem = false; Info.writeMem = true; return true; } case Intrinsic::arm_ldaex: case Intrinsic::arm_ldrex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = true; Info.writeMem = false; return true; } case Intrinsic::arm_stlex: case Intrinsic::arm_strex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = false; Info.writeMem = true; return true; } case Intrinsic::arm_stlexd: case Intrinsic::arm_strexd: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i64; Info.ptrVal = I.getArgOperand(2); Info.offset = 0; Info.align = 8; Info.vol = true; Info.readMem = false; Info.writeMem = true; return true; } case Intrinsic::arm_ldaexd: case Intrinsic::arm_ldrexd: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i64; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = 8; Info.vol = true; Info.readMem = true; Info.writeMem = false; return true; } default: break; } return false; } /// \brief Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned Bits = Ty->getPrimitiveSizeInBits(); if (Bits == 0 || Bits > 32) return false; return true; } Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); // First, if the target has no DMB, see what fallback we can use. if (!Subtarget->hasDataBarrier()) { // Some ARMv6 cpus can support data barriers with an mcr instruction. // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get // here. if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(7), Builder.getInt32(10), Builder.getInt32(5)}; return Builder.CreateCall(MCR, args); } else { // Instead of using barriers, atomic accesses on these subtargets use // libcalls. llvm_unreachable("makeDMB on a target so old that it has no barriers"); } } else { Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); // Only a full system barrier exists in the M-class architectures. Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain; Constant *CDomain = Builder.getInt32(Domain); return Builder.CreateCall(DMB, CDomain); } } // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, bool IsStore, bool IsLoad) const { if (!getInsertFencesForAtomic()) return nullptr; switch (Ord) { case NotAtomic: case Unordered: llvm_unreachable("Invalid fence: unordered/non-atomic"); case Monotonic: case Acquire: return nullptr; // Nothing to do case SequentiallyConsistent: if (!IsStore) return nullptr; // Nothing to do /*FALLTHROUGH*/ case Release: case AcquireRelease: if (Subtarget->isSwift()) return makeDMB(Builder, ARM_MB::ISHST); // FIXME: add a comment with a link to documentation justifying this. else return makeDMB(Builder, ARM_MB::ISH); } llvm_unreachable("Unknown fence ordering in emitLeadingFence"); } Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, bool IsStore, bool IsLoad) const { if (!getInsertFencesForAtomic()) return nullptr; switch (Ord) { case NotAtomic: case Unordered: llvm_unreachable("Invalid fence: unordered/not-atomic"); case Monotonic: case Release: return nullptr; // Nothing to do case Acquire: case AcquireRelease: case SequentiallyConsistent: return makeDMB(Builder, ARM_MB::ISH); } llvm_unreachable("Unknown fence ordering in emitTrailingFence"); } // Loads and stores less than 64-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit // anything for those. bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); return (Size == 64) && !Subtarget->isMClass(); } // Loads and stores less than 64-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit // anything for those. // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that // guarantee, see DDI0406C ARM architecture reference manual, // sections A8.8.72-74 LDRD) TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly : AtomicExpansionKind::None; } // For the real atomic operations, we have ldrex/strex up to 32 bits, // and up to 64 bits on the non-M profiles TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); return (Size <= (Subtarget->isMClass() ? 32U : 64U)) ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { return true; } // This has so far only been implemented for MachO. bool ARMTargetLowering::useLoadStackGuardNode() const { return Subtarget->isTargetMachO(); } bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const { // If we do not have NEON, vector types are not natively supported. if (!Subtarget->hasNEON()) return false; // Floating point values and vector values map to the same register file. // Therefore, although we could do a store extract of a vector type, this is // better to leave at float as we have more freedom in the addressing mode for // those. if (VectorTy->isFPOrFPVectorTy()) return false; // If the index is unknown at compile time, this is very expensive to lower // and it is not possible to combine the store with the extract. if (!isa(Idx)) return false; assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); unsigned BitWidth = cast(VectorTy)->getBitWidth(); // We can do a store + vector extract on any vector that fits perfectly in a D // or Q register. if (BitWidth == 64 || BitWidth == 128) { Cost = 0; return true; } return false; } bool ARMTargetLowering::isCheapToSpeculateCttz() const { return Subtarget->hasV6T2Ops(); } bool ARMTargetLowering::isCheapToSpeculateCtlz() const { return Subtarget->hasV6T2Ops(); } Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = cast(Addr->getType())->getElementType(); bool IsAcquire = isAtLeastAcquire(Ord); // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i32, i32} and we have to recombine them into a // single i64 here. if (ValTy->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); if (!Subtarget->isLittle()) std::swap (Lo, Hi); Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); return Builder.CreateOr( Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64"); } Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateTruncOrBitCast( Builder.CreateCall(Ldrex, Addr), cast(Addr->getType())->getElementType()); } void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilder<> &Builder) const { if (!Subtarget->hasV7Ops()) return; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); } Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); bool IsRelease = isAtLeastRelease(Ord); // Since the intrinsics must have legal type, the i64 intrinsics take two // parameters: "i32, i32". We must marshal Val into the appropriate form // before the call. if (Val->getType()->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; Function *Strex = Intrinsic::getDeclaration(M, Int); Type *Int32Ty = Type::getInt32Ty(M->getContext()); Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); if (!Subtarget->isLittle()) std::swap (Lo, Hi); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); return Builder.CreateCall(Strex, {Lo, Hi, Addr}); } Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex; Type *Tys[] = { Addr->getType() }; Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateCall( Strex, {Builder.CreateZExtOrBitCast( Val, Strex->getFunctionType()->getParamType(0)), Addr}); } /// \brief Lower an interleaved load into a vldN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements /// /// Into: /// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4) /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); VectorType *VecTy = Shuffles[0]->getType(); Type *EltTy = VecTy->getVectorElementType(); const DataLayout &DL = LI->getModule()->getDataLayout(); unsigned VecSize = DL.getTypeSizeInBits(VecTy); bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; // Skip if we do not have NEON and skip illegal vector types and vector types // with i64/f64 elements (vldN doesn't support i64/f64 elements). if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits) return false; // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. if (EltTy->isPointerTy()) VecTy = VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, Intrinsic::arm_neon_vld3, Intrinsic::arm_neon_vld4}; IRBuilder<> Builder(LI); SmallVector Ops; Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr)); Ops.push_back(Builder.getInt32(LI->getAlignment())); Type *Tys[] = { VecTy, Int8Ptr }; Function *VldnFunc = Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); // Replace uses of each shufflevector with the corresponding vector loaded // by ldN. for (unsigned i = 0; i < Shuffles.size(); i++) { ShuffleVectorInst *SV = Shuffles[i]; unsigned Index = Indices[i]; Value *SubVec = Builder.CreateExtractValue(VldN, Index); // Convert the integer vector to pointer vector if the element is pointer. if (EltTy->isPointerTy()) SubVec = Builder.CreateIntToPtr(SubVec, SV->getType()); SV->replaceAllUsesWith(SubVec); } return true; } /// \brief Get a mask consisting of sequential integers starting from \p Start. /// /// I.e. static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, unsigned NumElts) { SmallVector Mask; for (unsigned i = 0; i < NumElts; i++) Mask.push_back(Builder.getInt32(Start + i)); return ConstantVector::get(Mask); } /// \brief Lower an interleaved store into a vstN intrinsic. /// /// E.g. Lower an interleaved store (Factor = 3): /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> /// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4 /// /// Into: /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) /// /// Note that the new shufflevectors will be removed and we'll only generate one /// vst3 instruction in CodeGen. bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); VectorType *VecTy = SVI->getType(); assert(VecTy->getVectorNumElements() % Factor == 0 && "Invalid interleaved store"); unsigned NumSubElts = VecTy->getVectorNumElements() / Factor; Type *EltTy = VecTy->getVectorElementType(); VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); const DataLayout &DL = SI->getModule()->getDataLayout(); unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; // Skip if we do not have NEON and skip illegal vector types and vector types // with i64/f64 elements (vstN doesn't support i64/f64 elements). if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) || EltIs64Bits) return false; Value *Op0 = SVI->getOperand(0); Value *Op1 = SVI->getOperand(1); IRBuilder<> Builder(SI); // StN intrinsics don't support pointer vectors as arguments. Convert pointer // vectors to integer vectors. if (EltTy->isPointerTy()) { Type *IntTy = DL.getIntPtrType(EltTy); // Convert to the corresponding integer vector. Type *IntVecTy = VectorType::get(IntTy, Op0->getType()->getVectorNumElements()); Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); SubVecTy = VectorType::get(IntTy, NumSubElts); } static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, Intrinsic::arm_neon_vst3, Intrinsic::arm_neon_vst4}; SmallVector Ops; Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr)); Type *Tys[] = { Int8Ptr, SubVecTy }; Function *VstNFunc = Intrinsic::getDeclaration( SI->getModule(), StoreInts[Factor - 2], Tys); // Split the shufflevector operands into sub vectors for the new vstN call. for (unsigned i = 0; i < Factor; i++) Ops.push_back(Builder.CreateShuffleVector( Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts))); Ops.push_back(Builder.getInt32(SI->getAlignment())); Builder.CreateCall(VstNFunc, Ops); return true; } enum HABaseType { HA_UNKNOWN = 0, HA_FLOAT, HA_DOUBLE, HA_VECT64, HA_VECT128 }; static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members) { if (auto *ST = dyn_cast(Ty)) { for (unsigned i = 0; i < ST->getNumElements(); ++i) { uint64_t SubMembers = 0; if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) return false; Members += SubMembers; } } else if (auto *AT = dyn_cast(Ty)) { uint64_t SubMembers = 0; if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) return false; Members += SubMembers * AT->getNumElements(); } else if (Ty->isFloatTy()) { if (Base != HA_UNKNOWN && Base != HA_FLOAT) return false; Members = 1; Base = HA_FLOAT; } else if (Ty->isDoubleTy()) { if (Base != HA_UNKNOWN && Base != HA_DOUBLE) return false; Members = 1; Base = HA_DOUBLE; } else if (auto *VT = dyn_cast(Ty)) { Members = 1; switch (Base) { case HA_FLOAT: case HA_DOUBLE: return false; case HA_VECT64: return VT->getBitWidth() == 64; case HA_VECT128: return VT->getBitWidth() == 128; case HA_UNKNOWN: switch (VT->getBitWidth()) { case 64: Base = HA_VECT64; return true; case 128: Base = HA_VECT128; return true; default: return false; } } } return (Members > 0 && Members <= 4); } /// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when /// passing according to AAPCS rules. bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { if (getEffectiveCallingConv(CallConv, isVarArg) != CallingConv::ARM_AAPCS_VFP) return false; HABaseType Base = HA_UNKNOWN; uint64_t Members = 0; bool IsHA = isHomogeneousAggregate(Ty, Base, Members); DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump()); bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); return IsHA || IsIntArray; } unsigned ARMTargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { // Platforms which do not use SjLj EH may return values in these registers // via the personality function. return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; } unsigned ARMTargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { // Platforms which do not use SjLj EH may return values in these registers // via the personality function. return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; } void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { // Update IsSplitCSR in ARMFunctionInfo. ARMFunctionInfo *AFI = Entry->getParent()->getInfo(); AFI->setIsSplitCSR(true); } void ARMTargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (ARM::GPRRegClass.contains(*I)) RC = &ARM::GPRRegClass; else if (ARM::DPRRegClass.contains(*I)) RC = &ARM::DPRRegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); unsigned NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert(Entry->getParent()->getFunction()->hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); - BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - NewVR) + BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); + // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) - BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - *I) + BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } Index: projects/clang380-import/contrib/llvm/lib/Target/X86/X86CallingConv.td =================================================================== --- projects/clang380-import/contrib/llvm/lib/Target/X86/X86CallingConv.td (revision 294608) +++ projects/clang380-import/contrib/llvm/lib/Target/X86/X86CallingConv.td (revision 294609) @@ -1,887 +1,887 @@ //===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This describes the calling conventions for the X86-32 and X86-64 // architectures. // //===----------------------------------------------------------------------===// /// CCIfSubtarget - Match if the current subtarget has a feature F. class CCIfSubtarget : CCIf" "(State.getMachineFunction().getSubtarget()).", F), A>; //===----------------------------------------------------------------------===// // Return Value Calling Conventions //===----------------------------------------------------------------------===// // Return-value conventions common to all X86 CC's. def RetCC_X86Common : CallingConv<[ // Scalar values are returned in AX first, then DX. For i8, the ABI // requires the values to be in AL and AH, however this code uses AL and DL // instead. This is because using AH for the second register conflicts with // the way LLVM does multiple return values -- a return of {i16,i8} would end // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI // for functions that return two i8 values are currently expected to pack the // values into an i16 (which uses AX, and thus AL:AH). // // For code that doesn't care about the ABI, we allow returning more than two // integer values in registers. CCIfType<[i1], CCPromoteToType>, CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>, CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>, CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>, // Boolean vectors of AVX-512 are returned in SIMD registers. // The call from AVX to AVX-512 function should work, // since the boolean types in AVX/AVX2 are promoted by default. CCIfType<[v2i1], CCPromoteToType>, CCIfType<[v4i1], CCPromoteToType>, CCIfType<[v8i1], CCPromoteToType>, CCIfType<[v16i1], CCPromoteToType>, CCIfType<[v32i1], CCPromoteToType>, CCIfType<[v64i1], CCPromoteToType>, // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3 // can only be used by ABI non-compliant code. If the target doesn't have XMM // registers, it won't have vector types. CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX target feature. CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX-512 target feature. CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, // MMX vector types are always returned in MM0. If the target doesn't have // MM0, it doesn't support these vector types. CCIfType<[x86mmx], CCAssignToReg<[MM0]>>, // Long double types are always returned in FP0 (even with SSE). CCIfType<[f80], CCAssignToReg<[FP0, FP1]>> ]>; // X86-32 C return-value convention. def RetCC_X86_32_C : CallingConv<[ // The X86-32 calling convention returns FP values in FP0, unless marked // with "inreg" (used here to distinguish one kind of reg from another, // weirdly; this is really the sse-regparm calling convention) in which // case they use XMM0, otherwise it is the same as the common X86 calling // conv. CCIfInReg>>>, CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>, CCDelegateTo ]>; // X86-32 FastCC return-value convention. def RetCC_X86_32_Fast : CallingConv<[ // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has // SSE2. // This can happen when a float, 2 x float, or 3 x float vector is split by // target lowering, and is returned in 1-3 sse regs. CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, // For integers, ECX can be used as an extra return register CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>, CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>, // Otherwise, it is the same as the common X86 calling convention. CCDelegateTo ]>; // Intel_OCL_BI return-value convention. def RetCC_Intel_OCL_BI : CallingConv<[ // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3. CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, // 256-bit FP vectors // No more than 4 registers CCIfType<[v8f32, v4f64, v8i32, v4i64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, // 512-bit FP vectors CCIfType<[v16f32, v8f64, v16i32, v8i64], CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, // i32, i64 in the standard way CCDelegateTo ]>; // X86-32 HiPE return-value convention. def RetCC_X86_32_HiPE : CallingConv<[ // Promote all types to i32 CCIfType<[i8, i16], CCPromoteToType>, // Return: HP, P, VAL1, VAL2 CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>> ]>; // X86-32 HiPE return-value convention. def RetCC_X86_32_VectorCall : CallingConv<[ // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3. CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, // 256-bit FP vectors CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, // 512-bit FP vectors CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, // Return integers in the standard way. CCDelegateTo ]>; // X86-64 C return-value convention. def RetCC_X86_64_C : CallingConv<[ // The X86-64 calling convention always returns FP values in XMM0. CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>, // MMX vector types are always returned in XMM0. CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>, CCDelegateTo ]>; // X86-Win64 C return-value convention. def RetCC_X86_Win64_C : CallingConv<[ // The X86-Win64 calling convention always returns __m64 values in RAX. CCIfType<[x86mmx], CCBitConvertToType>, // Otherwise, everything is the same as 'normal' X86-64 C CC. CCDelegateTo ]>; // X86-64 HiPE return-value convention. def RetCC_X86_64_HiPE : CallingConv<[ // Promote all types to i64 CCIfType<[i8, i16, i32], CCPromoteToType>, // Return: HP, P, VAL1, VAL2 CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>> ]>; // X86-64 WebKit_JS return-value convention. def RetCC_X86_64_WebKit_JS : CallingConv<[ // Promote all types to i64 CCIfType<[i8, i16, i32], CCPromoteToType>, // Return: RAX CCIfType<[i64], CCAssignToReg<[RAX]>> ]>; // X86-64 AnyReg return-value convention. No explicit register is specified for // the return-value. The register allocator is allowed and expected to choose // any free register. // // This calling convention is currently only supported by the stackmap and // patchpoint intrinsics. All other uses will result in an assert on Debug // builds. On Release builds we fallback to the X86 C calling convention. def RetCC_X86_64_AnyReg : CallingConv<[ CCCustom<"CC_X86_AnyReg_Error"> ]>; // X86-64 HHVM return-value convention. def RetCC_X86_64_HHVM: CallingConv<[ // Promote all types to i64 CCIfType<[i8, i16, i32], CCPromoteToType>, // Return: could return in any GP register save RSP and R12. CCIfType<[i64], CCAssignToReg<[RBX, RBP, RDI, RSI, RDX, RCX, R8, R9, RAX, R10, R11, R13, R14, R15]>> ]>; // This is the root return-value convention for the X86-32 backend. def RetCC_X86_32 : CallingConv<[ // If FastCC, use RetCC_X86_32_Fast. CCIfCC<"CallingConv::Fast", CCDelegateTo>, // If HiPE, use RetCC_X86_32_HiPE. CCIfCC<"CallingConv::HiPE", CCDelegateTo>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo>, // Otherwise, use RetCC_X86_32_C. CCDelegateTo ]>; // This is the root return-value convention for the X86-64 backend. def RetCC_X86_64 : CallingConv<[ // HiPE uses RetCC_X86_64_HiPE CCIfCC<"CallingConv::HiPE", CCDelegateTo>, // Handle JavaScript calls. CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo>, CCIfCC<"CallingConv::AnyReg", CCDelegateTo>, // Handle explicit CC selection CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo>, // Handle HHVM calls. CCIfCC<"CallingConv::HHVM", CCDelegateTo>, // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo>, // Otherwise, drop to normal X86-64 CC CCDelegateTo ]>; // This is the return-value convention used for the entire X86 backend. def RetCC_X86 : CallingConv<[ // Check if this is the Intel OpenCL built-ins calling convention CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>, CCIfSubtarget<"is64Bit()", CCDelegateTo>, CCDelegateTo ]>; //===----------------------------------------------------------------------===// // X86-64 Argument Calling Conventions //===----------------------------------------------------------------------===// def CC_X86_64_C : CallingConv<[ // Handles byval parameters. CCIfByVal>, // Promote i1/i8/i16 arguments to i32. CCIfType<[i1, i8, i16], CCPromoteToType>, // The 'nest' parameter, if any, is passed in R10. CCIfNest>>, CCIfNest>, // The first 6 integer arguments are passed in integer registers. CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>, CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>, // The first 8 MMX vector arguments are passed in XMM registers on Darwin. CCIfType<[x86mmx], CCIfSubtarget<"isTargetDarwin()", CCIfSubtarget<"hasSSE2()", CCPromoteToType>>>, // Boolean vectors of AVX-512 are passed in SIMD registers. // The call from AVX to AVX-512 function should work, // since the boolean types in AVX/AVX2 are promoted by default. CCIfType<[v2i1], CCPromoteToType>, CCIfType<[v4i1], CCPromoteToType>, CCIfType<[v8i1], CCPromoteToType>, CCIfType<[v16i1], CCPromoteToType>, CCIfType<[v32i1], CCPromoteToType>, CCIfType<[v64i1], CCPromoteToType>, // The first 8 FP/Vector arguments are passed in XMM registers. CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, // The first 8 256-bit vector arguments are passed in YMM registers, unless // this is a vararg function. // FIXME: This isn't precisely correct; the x86-64 ABI document says that // fixed arguments to vararg functions are supposed to be passed in // registers. Actually modeling that would be a lot of work, though. CCIfNotVarArg>>>, // The first 8 512-bit vector arguments are passed in ZMM registers. CCIfNotVarArg>>>, // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, // Long doubles get stack slots whose size and alignment depends on the // subtarget. CCIfType<[f80, f128], CCAssignToStack<0, 0>>, // Vectors get 16-byte stack slots that are 16-byte aligned. CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCAssignToStack<32, 32>>, // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. CCIfType<[v16i32, v8i64, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; // Calling convention for X86-64 HHVM. def CC_X86_64_HHVM : CallingConv<[ // Use all/any GP registers for args, except RSP. CCIfType<[i64], CCAssignToReg<[RBX, R12, RBP, R15, RDI, RSI, RDX, RCX, R8, R9, RAX, R10, R11, R13, R14]>> ]>; // Calling convention for helper functions in HHVM. def CC_X86_64_HHVM_C : CallingConv<[ // Pass the first argument in RBP. CCIfType<[i64], CCAssignToReg<[RBP]>>, // Otherwise it's the same as the regular C calling convention. CCDelegateTo ]>; // Calling convention used on Win64 def CC_X86_Win64_C : CallingConv<[ // FIXME: Handle byval stuff. // FIXME: Handle varargs. // Promote i1/i8/i16 arguments to i32. CCIfType<[i1, i8, i16], CCPromoteToType>, // The 'nest' parameter, if any, is passed in R10. CCIfNest>, // 128 bit vectors are passed by pointer CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect>, // 256 bit vectors are passed by pointer CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect>, // 512 bit vectors are passed by pointer CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect>, // The first 4 MMX vector arguments are passed in GPRs. CCIfType<[x86mmx], CCBitConvertToType>, // The first 4 integer arguments are passed in integer registers. CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ], [XMM0, XMM1, XMM2, XMM3]>>, // Do not pass the sret argument in RCX, the Win64 thiscall calling // convention requires "this" to be passed in RCX. CCIfCC<"CallingConv::X86_ThisCall", CCIfSRet>>>, CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ], [XMM0, XMM1, XMM2, XMM3]>>, // The first 4 FP/Vector arguments are passed in XMM registers. CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3], [RCX , RDX , R8 , R9 ]>>, // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, // Long doubles get stack slots whose size and alignment depends on the // subtarget. CCIfType<[f80], CCAssignToStack<0, 0>> ]>; def CC_X86_Win64_VectorCall : CallingConv<[ // The first 6 floating point and vector types of 128 bits or less use // XMM0-XMM5. CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>, // 256-bit vectors use YMM registers. CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>, // 512-bit vectors use ZMM registers. CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>, // Delegate to fastcall to handle integer types. CCDelegateTo ]>; def CC_X86_64_GHC : CallingConv<[ // Promote i8/i16/i32 arguments to i64. CCIfType<[i8, i16, i32], CCPromoteToType>, // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim CCIfType<[i64], CCAssignToReg<[R13, RBP, R12, RBX, R14, RSI, RDI, R8, R9, R15]>>, // Pass in STG registers: F1, F2, F3, F4, D1, D2 CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>> ]>; def CC_X86_64_HiPE : CallingConv<[ // Promote i8/i16/i32 arguments to i64. CCIfType<[i8, i16, i32], CCPromoteToType>, // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2, ARG3 CCIfType<[i64], CCAssignToReg<[R15, RBP, RSI, RDX, RCX, R8]>>, // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>> ]>; def CC_X86_64_WebKit_JS : CallingConv<[ // Promote i8/i16 arguments to i32. CCIfType<[i8, i16], CCPromoteToType>, // Only the first integer argument is passed in register. CCIfType<[i32], CCAssignToReg<[EAX]>>, CCIfType<[i64], CCAssignToReg<[RAX]>>, // The remaining integer arguments are passed on the stack. 32bit integer and // floating-point arguments are aligned to 4 byte and stored in 4 byte slots. // 64bit integer and floating-point arguments are aligned to 8 byte and stored // in 8 byte stack slots. CCIfType<[i32, f32], CCAssignToStack<4, 4>>, CCIfType<[i64, f64], CCAssignToStack<8, 8>> ]>; // No explicit register is specified for the AnyReg calling convention. The // register allocator may assign the arguments to any free register. // // This calling convention is currently only supported by the stackmap and // patchpoint intrinsics. All other uses will result in an assert on Debug // builds. On Release builds we fallback to the X86 C calling convention. def CC_X86_64_AnyReg : CallingConv<[ CCCustom<"CC_X86_AnyReg_Error"> ]>; //===----------------------------------------------------------------------===// // X86 C Calling Convention //===----------------------------------------------------------------------===// /// CC_X86_32_Vector_Common - In all X86-32 calling conventions, extra vector /// values are spilled on the stack. def CC_X86_32_Vector_Common : CallingConv<[ // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCAssignToStack<32, 32>>, // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned. CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; // CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in // vector registers def CC_X86_32_Vector_Standard : CallingConv<[ // SSE vector arguments are passed in XMM registers. CCIfNotVarArg>>, // AVX 256-bit vector arguments are passed in YMM registers. CCIfNotVarArg>>>, // AVX 512-bit vector arguments are passed in ZMM registers. CCIfNotVarArg>>, CCDelegateTo ]>; // CC_X86_32_Vector_Darwin - The first 4 vector arguments are passed in // vector registers. def CC_X86_32_Vector_Darwin : CallingConv<[ // SSE vector arguments are passed in XMM registers. CCIfNotVarArg>>, // AVX 256-bit vector arguments are passed in YMM registers. CCIfNotVarArg>>>, // AVX 512-bit vector arguments are passed in ZMM registers. CCIfNotVarArg>>, CCDelegateTo ]>; /// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP /// values are spilled on the stack. def CC_X86_32_Common : CallingConv<[ // Handles byval parameters. CCIfByVal>, // The first 3 float or double arguments, if marked 'inreg' and if the call // is not a vararg call and if SSE2 is available, are passed in SSE registers. CCIfNotVarArg>>>>, // The first 3 __m64 vector arguments are passed in mmx registers if the // call is not a vararg call. CCIfNotVarArg>>, // Integer/Float values get stored in stack slots that are 4 bytes in // size and 4-byte aligned. CCIfType<[i32, f32], CCAssignToStack<4, 4>>, // Doubles get 8-byte slots that are 4-byte aligned. CCIfType<[f64], CCAssignToStack<8, 4>>, // Long doubles get slots whose size depends on the subtarget. CCIfType<[f80], CCAssignToStack<0, 4>>, // Boolean vectors of AVX-512 are passed in SIMD registers. // The call from AVX to AVX-512 function should work, // since the boolean types in AVX/AVX2 are promoted by default. CCIfType<[v2i1], CCPromoteToType>, CCIfType<[v4i1], CCPromoteToType>, CCIfType<[v8i1], CCPromoteToType>, CCIfType<[v16i1], CCPromoteToType>, CCIfType<[v32i1], CCPromoteToType>, CCIfType<[v64i1], CCPromoteToType>, // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are // passed in the parameter area. CCIfType<[x86mmx], CCAssignToStack<8, 4>>, // Darwin passes vectors in a form that differs from the i386 psABI CCIfSubtarget<"isTargetDarwin()", CCDelegateTo>, // Otherwise, drop to 'normal' X86-32 CC CCDelegateTo ]>; def CC_X86_32_C : CallingConv<[ // Promote i1/i8/i16 arguments to i32. CCIfType<[i1, i8, i16], CCPromoteToType>, // The 'nest' parameter, if any, is passed in ECX. CCIfNest>, // The first 3 integer arguments, if marked 'inreg' and if the call is not // a vararg call, are passed in integer registers. CCIfNotVarArg>>>, // Otherwise, same as everything else. CCDelegateTo ]>; def CC_X86_32_MCU : CallingConv<[ // Handles byval parameters. Note that, like FastCC, we can't rely on // the delegation to CC_X86_32_Common because that happens after code that // puts arguments in registers. CCIfByVal>, // Promote i1/i8/i16 arguments to i32. CCIfType<[i1, i8, i16], CCPromoteToType>, // If the call is not a vararg call, some arguments may be passed // in integer registers. CCIfNotVarArg>>, // Otherwise, same as everything else. CCDelegateTo ]>; def CC_X86_32_FastCall : CallingConv<[ // Promote i1/i8/i16 arguments to i32. CCIfType<[i1, i8, i16], CCPromoteToType>, // The 'nest' parameter, if any, is passed in EAX. CCIfNest>, // The first 2 integer arguments are passed in ECX/EDX CCIfInReg>>, // Otherwise, same as everything else. CCDelegateTo ]>; def CC_X86_32_VectorCall : CallingConv<[ // The first 6 floating point and vector types of 128 bits or less use // XMM0-XMM5. CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>, // 256-bit vectors use YMM registers. CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>, // 512-bit vectors use ZMM registers. CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>, // Otherwise, pass it indirectly. CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v32i8, v16i16, v8i32, v4i64, v8f32, v4f64, v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], CCCustom<"CC_X86_32_VectorCallIndirect">>, // Delegate to fastcall to handle integer types. CCDelegateTo ]>; def CC_X86_32_ThisCall_Common : CallingConv<[ // The first integer argument is passed in ECX CCIfType<[i32], CCAssignToReg<[ECX]>>, // Otherwise, same as everything else. CCDelegateTo ]>; def CC_X86_32_ThisCall_Mingw : CallingConv<[ // Promote i1/i8/i16 arguments to i32. CCIfType<[i1, i8, i16], CCPromoteToType>, CCDelegateTo ]>; def CC_X86_32_ThisCall_Win : CallingConv<[ // Promote i1/i8/i16 arguments to i32. CCIfType<[i1, i8, i16], CCPromoteToType>, // Pass sret arguments indirectly through stack. CCIfSRet>, CCDelegateTo ]>; def CC_X86_32_ThisCall : CallingConv<[ CCIfSubtarget<"isTargetCygMing()", CCDelegateTo>, CCDelegateTo ]>; def CC_X86_32_FastCC : CallingConv<[ // Handles byval parameters. Note that we can't rely on the delegation // to CC_X86_32_Common for this because that happens after code that // puts arguments in registers. CCIfByVal>, // Promote i1/i8/i16 arguments to i32. CCIfType<[i1, i8, i16], CCPromoteToType>, // The 'nest' parameter, if any, is passed in EAX. CCIfNest>, // The first 2 integer arguments are passed in ECX/EDX CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>, // The first 3 float or double arguments, if the call is not a vararg // call and if SSE2 is available, are passed in SSE registers. CCIfNotVarArg>>>, // Doubles get 8-byte slots that are 8-byte aligned. CCIfType<[f64], CCAssignToStack<8, 8>>, // Otherwise, same as everything else. CCDelegateTo ]>; def CC_X86_32_GHC : CallingConv<[ // Promote i8/i16 arguments to i32. CCIfType<[i8, i16], CCPromoteToType>, // Pass in STG registers: Base, Sp, Hp, R1 CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>> ]>; def CC_X86_32_HiPE : CallingConv<[ // Promote i8/i16 arguments to i32. CCIfType<[i8, i16], CCPromoteToType>, // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2 CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX, ECX]>>, // Integer/Float values get stored in stack slots that are 4 bytes in // size and 4-byte aligned. CCIfType<[i32, f32], CCAssignToStack<4, 4>> ]>; // X86-64 Intel OpenCL built-ins calling convention. def CC_Intel_OCL_BI : CallingConv<[ CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>, CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>, CCIfType<[i32], CCIfSubtarget<"is64Bit()", CCAssignToReg<[EDI, ESI, EDX, ECX]>>>, CCIfType<[i64], CCIfSubtarget<"is64Bit()", CCAssignToReg<[RDI, RSI, RDX, RCX]>>>, CCIfType<[i32], CCAssignToStack<4, 4>>, // The SSE vector arguments are passed in XMM registers. CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, // The 256-bit vector arguments are passed in YMM registers. CCIfType<[v8f32, v4f64, v8i32, v4i64], CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>, // The 512-bit vector arguments are passed in ZMM registers. CCIfType<[v16f32, v8f64, v16i32, v8i64], CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>, // Pass masks in mask registers CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>, CCIfSubtarget<"isTargetWin64()", CCDelegateTo>, CCIfSubtarget<"is64Bit()", CCDelegateTo>, CCDelegateTo ]>; def CC_X86_32_Intr : CallingConv<[ CCAssignToStack<4, 4> ]>; def CC_X86_64_Intr : CallingConv<[ CCAssignToStack<8, 8> ]>; //===----------------------------------------------------------------------===// // X86 Root Argument Calling Conventions //===----------------------------------------------------------------------===// // This is the root argument convention for the X86-32 backend. def CC_X86_32 : CallingConv<[ CCIfSubtarget<"isTargetMCU()", CCDelegateTo>, CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo>, CCIfCC<"CallingConv::Fast", CCDelegateTo>, CCIfCC<"CallingConv::GHC", CCDelegateTo>, CCIfCC<"CallingConv::HiPE", CCDelegateTo>, CCIfCC<"CallingConv::X86_INTR", CCDelegateTo>, // Otherwise, drop to normal X86-32 CC CCDelegateTo ]>; // This is the root argument convention for the X86-64 backend. def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::GHC", CCDelegateTo>, CCIfCC<"CallingConv::HiPE", CCDelegateTo>, CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo>, CCIfCC<"CallingConv::AnyReg", CCDelegateTo>, CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo>, CCIfCC<"CallingConv::HHVM", CCDelegateTo>, CCIfCC<"CallingConv::HHVM_C", CCDelegateTo>, CCIfCC<"CallingConv::X86_INTR", CCDelegateTo>, // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo>, // Otherwise, drop to normal X86-64 CC CCDelegateTo ]>; // This is the argument convention used for the entire X86 backend. def CC_X86 : CallingConv<[ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>, CCIfSubtarget<"is64Bit()", CCDelegateTo>, CCDelegateTo ]>; //===----------------------------------------------------------------------===// // Callee-saved Registers. //===----------------------------------------------------------------------===// def CSR_NoRegs : CalleeSavedRegs<(add)>; def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>; def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>; def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>; def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, (sequence "XMM%u", 6, 15))>; // The function used by Darwin to obtain the address of a thread-local variable // uses rdi to pass a single parameter and rax for the return value. All other // GPRs are preserved. def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI, R8, R9, R10, R11)>; // CSRs that are handled by prologue, epilogue. -def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add)>; +def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add RBP)>; // CSRs that are handled explicitly via copies. -def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(add CSR_64_TLS_Darwin)>; +def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(sub CSR_64_TLS_Darwin, RBP)>; // All GPRs - except r11 def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, R8, R9, R10, RSP)>; // All registers - except r11 def CSR_64_RT_AllRegs : CalleeSavedRegs<(add CSR_64_RT_MostRegs, (sequence "XMM%u", 0, 15))>; def CSR_64_RT_AllRegs_AVX : CalleeSavedRegs<(add CSR_64_RT_MostRegs, (sequence "YMM%u", 0, 15))>; def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, R12, R13, R14, R15, RBP, (sequence "XMM%u", 0, 15))>; def CSR_32_AllRegs : CalleeSavedRegs<(add EAX, EBX, ECX, EDX, EBP, ESI, EDI, ESP)>; def CSR_32_AllRegs_SSE : CalleeSavedRegs<(add CSR_32_AllRegs, (sequence "XMM%u", 0, 7))>; def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX, RSP, (sequence "XMM%u", 16, 31))>; def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, RSP, (sequence "YMM%u", 0, 31)), (sequence "XMM%u", 0, 15))>; // Standard C + YMM6-15 def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, (sequence "YMM%u", 6, 15))>; def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, (sequence "ZMM%u", 6, 21), K4, K5, K6, K7)>; //Standard C + XMM 8-15 def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, (sequence "XMM%u", 8, 15))>; //Standard C + YMM 8-15 def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, (sequence "YMM%u", 8, 15))>; def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15, (sequence "ZMM%u", 16, 31), K4, K5, K6, K7)>; // Only R12 is preserved for PHP calls in HHVM. def CSR_64_HHVM : CalleeSavedRegs<(add R12)>; Index: projects/clang380-import/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp =================================================================== --- projects/clang380-import/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp (revision 294609) @@ -1,2699 +1,2715 @@ //===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains the X86 implementation of TargetFrameLowering class. // //===----------------------------------------------------------------------===// #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/Debug.h" #include using namespace llvm; X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride) : TargetFrameLowering(StackGrowsDown, StackAlignOverride, STI.is64Bit() ? -8 : -4), STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) { // Cache a bunch of frame-related predicates for this subtarget. SlotSize = TRI->getSlotSize(); Is64Bit = STI.is64Bit(); IsLP64 = STI.isTarget64BitLP64(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); StackPtr = TRI->getStackRegister(); } bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { return !MF.getFrameInfo()->hasVarSizedObjects() && !MF.getInfo()->getHasPushSequences(); } /// canSimplifyCallFramePseudos - If there is a reserved call frame, the /// call frame pseudos can be simplified. Having a FP, as in the default /// implementation, is not sufficient here since we can't always use it. /// Use a more nuanced condition. bool X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { return hasReservedCallFrame(MF) || (hasFP(MF) && !TRI->needsStackRealignment(MF)) || TRI->hasBasePointer(MF); } // needsFrameIndexResolution - Do we need to perform FI resolution for // this function. Normally, this is required only when the function // has any stack objects. However, FI resolution actually has another job, // not apparent from the title - it resolves callframesetup/destroy // that were not simplified earlier. // So, this is required for x86 functions that have push sequences even // when there are no stack objects. bool X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { return MF.getFrameInfo()->hasStackObjects() || MF.getInfo()->getHasPushSequences(); } /// hasFP - Return true if the specified function should have a dedicated frame /// pointer register. This is true if the function has variable sized allocas /// or if frame pointer elimination is disabled. bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineModuleInfo &MMI = MF.getMMI(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() || MF.getInfo()->getForceFramePointer() || MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() || MFI->hasStackMap() || MFI->hasPatchPoint() || MFI->hasCopyImplyingStackAdjustment()); } static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { if (IsLP64) { if (isInt<8>(Imm)) return X86::SUB64ri8; return X86::SUB64ri32; } else { if (isInt<8>(Imm)) return X86::SUB32ri8; return X86::SUB32ri; } } static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { if (IsLP64) { if (isInt<8>(Imm)) return X86::ADD64ri8; return X86::ADD64ri32; } else { if (isInt<8>(Imm)) return X86::ADD32ri8; return X86::ADD32ri; } } static unsigned getSUBrrOpcode(unsigned isLP64) { return isLP64 ? X86::SUB64rr : X86::SUB32rr; } static unsigned getADDrrOpcode(unsigned isLP64) { return isLP64 ? X86::ADD64rr : X86::ADD32rr; } static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) { if (IsLP64) { if (isInt<8>(Imm)) return X86::AND64ri8; return X86::AND64ri32; } if (isInt<8>(Imm)) return X86::AND32ri8; return X86::AND32ri; } static unsigned getLEArOpcode(unsigned IsLP64) { return IsLP64 ? X86::LEA64r : X86::LEA32r; } /// findDeadCallerSavedReg - Return a caller-saved register that isn't live /// when it reaches the "return" instruction. We can then pop a stack object /// to this register without worry about clobbering it. static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, const X86RegisterInfo *TRI, bool Is64Bit) { const MachineFunction *MF = MBB.getParent(); const Function *F = MF->getFunction(); if (!F || MF->getMMI().callsEHReturn()) return 0; const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF); unsigned Opc = MBBI->getOpcode(); switch (Opc) { default: return 0; case X86::RETL: case X86::RETQ: case X86::RETIL: case X86::RETIQ: case X86::TCRETURNdi: case X86::TCRETURNri: case X86::TCRETURNmi: case X86::TCRETURNdi64: case X86::TCRETURNri64: case X86::TCRETURNmi64: case X86::EH_RETURN: case X86::EH_RETURN64: { SmallSet Uses; for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MBBI->getOperand(i); if (!MO.isReg() || MO.isDef()) continue; unsigned Reg = MO.getReg(); if (!Reg) continue; for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) Uses.insert(*AI); } for (auto CS : AvailableRegs) if (!Uses.count(CS) && CS != X86::RIP) return CS; } } return 0; } static bool isEAXLiveIn(MachineFunction &MF) { for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), EE = MF.getRegInfo().livein_end(); II != EE; ++II) { unsigned Reg = II->first; if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX || Reg == X86::AH || Reg == X86::AL) return true; } return false; } /// Check if the flags need to be preserved before the terminators. /// This would be the case, if the eflags is live-in of the region /// composed by the terminators or live-out of that region, without /// being defined by a terminator. static bool flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { for (const MachineInstr &MI : MBB.terminators()) { bool BreakNext = false; for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (Reg != X86::EFLAGS) continue; // This terminator needs an eflags that is not defined // by a previous another terminator: // EFLAGS is live-in of the region composed by the terminators. if (!MO.isDef()) return true; // This terminator defines the eflags, i.e., we don't need to preserve it. // However, we still need to check this specific terminator does not // read a live-in value. BreakNext = true; } // We found a definition of the eflags, no need to preserve them. if (BreakNext) return false; } // None of the terminators use or define the eflags. // Check if they are live-out, that would imply we need to preserve them. for (const MachineBasicBlock *Succ : MBB.successors()) if (Succ->isLiveIn(X86::EFLAGS)) return true; return false; } /// emitSPUpdate - Emit a series of instructions to increment / decrement the /// stack pointer by a constant value. void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, int64_t NumBytes, bool InEpilogue) const { bool isSub = NumBytes < 0; uint64_t Offset = isSub ? -NumBytes : NumBytes; uint64_t Chunk = (1LL << 31) - 1; DebugLoc DL = MBB.findDebugLoc(MBBI); while (Offset) { if (Offset > Chunk) { // Rather than emit a long series of instructions for large offsets, // load the offset into a register and do one sub/add unsigned Reg = 0; if (isSub && !isEAXLiveIn(*MBB.getParent())) Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX); else Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit); if (Reg) { unsigned Opc = Is64Bit ? X86::MOV64ri : X86::MOV32ri; BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg) .addImm(Offset); Opc = isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addReg(Reg); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. Offset = 0; continue; } } uint64_t ThisVal = std::min(Offset, Chunk); if (ThisVal == (Is64Bit ? 8 : 4)) { // Use push / pop instead. unsigned Reg = isSub ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX) : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit); if (Reg) { unsigned Opc = isSub ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r) : (Is64Bit ? X86::POP64r : X86::POP32r); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc)) .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); if (isSub) MI->setFlag(MachineInstr::FrameSetup); else MI->setFlag(MachineInstr::FrameDestroy); Offset -= ThisVal; continue; } } MachineInstrBuilder MI = BuildStackAdjustment( MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue); if (isSub) MI.setMIFlag(MachineInstr::FrameSetup); else MI.setMIFlag(MachineInstr::FrameDestroy); Offset -= ThisVal; } } MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, int64_t Offset, bool InEpilogue) const { assert(Offset != 0 && "zero offset stack adjustment requested"); // On Atom, using LEA to adjust SP is preferred, but using it in the epilogue // is tricky. bool UseLEA; if (!InEpilogue) { // Check if inserting the prologue at the beginning // of MBB would require to use LEA operations. // We need to use LEA operations if EFLAGS is live in, because // it means an instruction will read it before it gets defined. UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS); } else { // If we can use LEA for SP but we shouldn't, check that none // of the terminators uses the eflags. Otherwise we will insert // a ADD that will redefine the eflags and break the condition. // Alternatively, we could move the ADD, but this may not be possible // and is an optimization anyway. UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent()); if (UseLEA && !STI.useLeaForSP()) UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB); // If that assert breaks, that means we do not do the right thing // in canUseAsEpilogue. assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) && "We shouldn't have allowed this insertion point"); } MachineInstrBuilder MI; if (UseLEA) { MI = addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(getLEArOpcode(Uses64BitFramePtr)), StackPtr), StackPtr, false, Offset); } else { bool IsSub = Offset < 0; uint64_t AbsOffset = IsSub ? -Offset : Offset; unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset) : getADDriOpcode(Uses64BitFramePtr, AbsOffset); MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addImm(AbsOffset); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. } return MI; } int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, bool doMergeWithPrevious) const { if ((doMergeWithPrevious && MBBI == MBB.begin()) || (!doMergeWithPrevious && MBBI == MBB.end())) return 0; MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI; MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr : std::next(MBBI); unsigned Opc = PI->getOpcode(); int Offset = 0; if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || Opc == X86::LEA32r || Opc == X86::LEA64_32r) && PI->getOperand(0).getReg() == StackPtr){ Offset += PI->getOperand(2).getImm(); MBB.erase(PI); if (!doMergeWithPrevious) MBBI = NI; } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && PI->getOperand(0).getReg() == StackPtr) { Offset -= PI->getOperand(2).getImm(); MBB.erase(PI); if (!doMergeWithPrevious) MBBI = NI; } return Offset; } void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, MCCFIInstruction CFIInst) const { MachineFunction &MF = *MBB.getParent(); unsigned CFIIndex = MF.getMMI().addFrameInst(CFIInst); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } void X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); // Add callee saved registers to move list. const std::vector &CSI = MFI->getCalleeSavedInfo(); if (CSI.empty()) return; // Calculate offsets. for (std::vector::const_iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) { int64_t Offset = MFI->getObjectOffset(I->getFrameIdx()); unsigned Reg = I->getReg(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); } } MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { const X86Subtarget &STI = MF.getSubtarget(); if (STI.isTargetWindowsCoreCLR()) { if (InProlog) { return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true); } else { return emitStackProbeInline(MF, MBB, MBBI, DL, false); } } else { return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog); } } void X86FrameLowering::inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologMBB) const { const StringRef ChkStkStubSymbol = "__chkstk_stub"; MachineInstr *ChkStkStub = nullptr; for (MachineInstr &MI : PrologMBB) { if (MI.isCall() && MI.getOperand(0).isSymbol() && ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) { ChkStkStub = &MI; break; } } if (ChkStkStub != nullptr) { MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator()); assert(std::prev(MBBI).operator==(ChkStkStub) && "MBBI expected after __chkstk_stub."); DebugLoc DL = PrologMBB.findDebugLoc(MBBI); emitStackProbeInline(MF, PrologMBB, MBBI, DL, true); ChkStkStub->eraseFromParent(); } } MachineInstr *X86FrameLowering::emitStackProbeInline( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { const X86Subtarget &STI = MF.getSubtarget(); assert(STI.is64Bit() && "different expansion needed for 32 bit"); assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR"); const TargetInstrInfo &TII = *STI.getInstrInfo(); const BasicBlock *LLVM_BB = MBB.getBasicBlock(); // RAX contains the number of bytes of desired stack adjustment. // The handling here assumes this value has already been updated so as to // maintain stack alignment. // // We need to exit with RSP modified by this amount and execute suitable // page touches to notify the OS that we're growing the stack responsibly. // All stack probing must be done without modifying RSP. // // MBB: // SizeReg = RAX; // ZeroReg = 0 // CopyReg = RSP // Flags, TestReg = CopyReg - SizeReg // FinalReg = !Flags.Ovf ? TestReg : ZeroReg // LimitReg = gs magic thread env access // if FinalReg >= LimitReg goto ContinueMBB // RoundBB: // RoundReg = page address of FinalReg // LoopMBB: // LoopReg = PHI(LimitReg,ProbeReg) // ProbeReg = LoopReg - PageSize // [ProbeReg] = 0 // if (ProbeReg > RoundReg) goto LoopMBB // ContinueMBB: // RSP = RSP - RAX // [rest of original MBB] // Set up the new basic blocks MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator MBBIter = std::next(MBB.getIterator()); MF.insert(MBBIter, RoundMBB); MF.insert(MBBIter, LoopMBB); MF.insert(MBBIter, ContinueMBB); // Split MBB and move the tail portion down to ContinueMBB. MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI); ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end()); ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB); // Some useful constants const int64_t ThreadEnvironmentStackLimit = 0x10; const int64_t PageSize = 0x1000; const int64_t PageMask = ~(PageSize - 1); // Registers we need. For the normal case we use virtual // registers. For the prolog expansion we use RAX, RCX and RDX. MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *RegClass = &X86::GR64RegClass; const unsigned SizeReg = InProlog ? (unsigned)X86::RAX : MRI.createVirtualRegister(RegClass), ZeroReg = InProlog ? (unsigned)X86::RCX : MRI.createVirtualRegister(RegClass), CopyReg = InProlog ? (unsigned)X86::RDX : MRI.createVirtualRegister(RegClass), TestReg = InProlog ? (unsigned)X86::RDX : MRI.createVirtualRegister(RegClass), FinalReg = InProlog ? (unsigned)X86::RDX : MRI.createVirtualRegister(RegClass), RoundedReg = InProlog ? (unsigned)X86::RDX : MRI.createVirtualRegister(RegClass), LimitReg = InProlog ? (unsigned)X86::RCX : MRI.createVirtualRegister(RegClass), JoinReg = InProlog ? (unsigned)X86::RCX : MRI.createVirtualRegister(RegClass), ProbeReg = InProlog ? (unsigned)X86::RCX : MRI.createVirtualRegister(RegClass); // SP-relative offsets where we can save RCX and RDX. int64_t RCXShadowSlot = 0; int64_t RDXShadowSlot = 0; // If inlining in the prolog, save RCX and RDX. // Future optimization: don't save or restore if not live in. if (InProlog) { // Compute the offsets. We need to account for things already // pushed onto the stack at this point: return address, frame // pointer (if used), and callee saves. X86MachineFunctionInfo *X86FI = MF.getInfo(); const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize(); const bool HasFP = hasFP(MF); RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0); RDXShadowSlot = RCXShadowSlot + 8; // Emit the saves. addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, RCXShadowSlot) .addReg(X86::RCX); addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, RDXShadowSlot) .addReg(X86::RDX); } else { // Not in the prolog. Copy RAX to a virtual reg. BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX); } // Add code to MBB to check for overflow and set the new target stack pointer // to zero if so. BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg) .addReg(ZeroReg, RegState::Undef) .addReg(ZeroReg, RegState::Undef); BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP); BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg) .addReg(CopyReg) .addReg(SizeReg); BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg) .addReg(TestReg) .addReg(ZeroReg); // FinalReg now holds final stack pointer value, or zero if // allocation would overflow. Compare against the current stack // limit from the thread environment block. Note this limit is the // lowest touched page on the stack, not the point at which the OS // will cause an overflow exception, so this is just an optimization // to avoid unnecessarily touching pages that are below the current // SP but already commited to the stack by the OS. BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg) .addReg(0) .addImm(1) .addReg(0) .addImm(ThreadEnvironmentStackLimit) .addReg(X86::GS); BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg); // Jump if the desired stack pointer is at or above the stack limit. BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB); // Add code to roundMBB to round the final stack pointer to a page boundary. BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg) .addReg(FinalReg) .addImm(PageMask); BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB); // LimitReg now holds the current stack limit, RoundedReg page-rounded // final RSP value. Add code to loopMBB to decrement LimitReg page-by-page // and probe until we reach RoundedReg. if (!InProlog) { BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg) .addReg(LimitReg) .addMBB(RoundMBB) .addReg(ProbeReg) .addMBB(LoopMBB); } addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg, false, -PageSize); // Probe by storing a byte onto the stack. BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi)) .addReg(ProbeReg) .addImm(1) .addReg(0) .addImm(0) .addReg(0) .addImm(0); BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr)) .addReg(RoundedReg) .addReg(ProbeReg); BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB); MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI(); // If in prolog, restore RDX and RCX. if (InProlog) { addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), X86::RCX), X86::RSP, false, RCXShadowSlot); addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), X86::RDX), X86::RSP, false, RDXShadowSlot); } // Now that the probing is done, add code to continueMBB to update // the stack pointer for real. BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP) .addReg(X86::RSP) .addReg(SizeReg); // Add the control flow edges we need. MBB.addSuccessor(ContinueMBB); MBB.addSuccessor(RoundMBB); RoundMBB->addSuccessor(LoopMBB); LoopMBB->addSuccessor(ContinueMBB); LoopMBB->addSuccessor(LoopMBB); // Mark all the instructions added to the prolog as frame setup. if (InProlog) { for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) { BeforeMBBI->setFlag(MachineInstr::FrameSetup); } for (MachineInstr &MI : *RoundMBB) { MI.setFlag(MachineInstr::FrameSetup); } for (MachineInstr &MI : *LoopMBB) { MI.setFlag(MachineInstr::FrameSetup); } for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin(); CMBBI != ContinueMBBI; ++CMBBI) { CMBBI->setFlag(MachineInstr::FrameSetup); } } // Possible TODO: physreg liveness for InProlog case. return ContinueMBBI; } MachineInstr *X86FrameLowering::emitStackProbeCall( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; unsigned CallOp; if (Is64Bit) CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32; else CallOp = X86::CALLpcrel32; const char *Symbol; if (Is64Bit) { if (STI.isTargetCygMing()) { Symbol = "___chkstk_ms"; } else { Symbol = "__chkstk"; } } else if (STI.isTargetCygMing()) Symbol = "_alloca"; else Symbol = "_chkstk"; MachineInstrBuilder CI; MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI); // All current stack probes take AX and SP as input, clobber flags, and // preserve all registers. x86_64 probes leave RSP unmodified. if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { // For the large code model, we have to call through a register. Use R11, // as it is scratch in all supported calling conventions. BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11) .addExternalSymbol(Symbol); CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11); } else { CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol); } unsigned AX = Is64Bit ? X86::RAX : X86::EAX; unsigned SP = Is64Bit ? X86::RSP : X86::ESP; CI.addReg(AX, RegState::Implicit) .addReg(SP, RegState::Implicit) .addReg(AX, RegState::Define | RegState::Implicit) .addReg(SP, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); if (Is64Bit) { // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp // themselves. It also does not clobber %rax so we can reuse it when // adjusting %rsp. BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP) .addReg(X86::RSP) .addReg(X86::RAX); } if (InProlog) { // Apply the frame setup flag to all inserted instrs. for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI) ExpansionMBBI->setFlag(MachineInstr::FrameSetup); } return MBBI; } MachineInstr *X86FrameLowering::emitStackProbeInlineStub( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { assert(InProlog && "ChkStkStub called outside prolog!"); BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) .addExternalSymbol("__chkstk_stub"); return MBBI; } static unsigned calculateSetFPREG(uint64_t SPAdjust) { // Win64 ABI has a less restrictive limitation of 240; 128 works equally well // and might require smaller successive adjustments. const uint64_t Win64MaxSEHOffset = 128; uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset); // Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode. return SEHFrameOffset & -16; } // If we're forcing a stack realignment we can't rely on just the frame // info, we need to know the ABI stack alignment as well in case we // have a call out. Otherwise just make sure we have some alignment - we'll // go with the minimum SlotSize. uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. unsigned StackAlign = getStackAlignment(); if (MF.getFunction()->hasFnAttribute("stackrealign")) { if (MFI->hasCalls()) MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; else if (MaxAlign < SlotSize) MaxAlign = SlotSize; } return MaxAlign; } void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, unsigned Reg, uint64_t MaxAlign) const { uint64_t Val = -MaxAlign; unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) .addReg(Reg) .addImm(Val) .setMIFlag(MachineInstr::FrameSetup); // The EFLAGS implicit def is dead. MI->getOperand(3).setIsDead(); } /// emitPrologue - Push callee-saved registers onto the stack, which /// automatically adjust the stack pointer. Adjust the stack pointer to allocate /// space for local variables. Also emit labels used by the exception handler to /// generate the exception handling frames. /* Here's a gist of what gets emitted: ; Establish frame pointer, if needed [if needs FP] push %rbp .cfi_def_cfa_offset 16 .cfi_offset %rbp, -16 .seh_pushreg %rpb mov %rsp, %rbp .cfi_def_cfa_register %rbp ; Spill general-purpose registers [for all callee-saved GPRs] pushq % [if not needs FP] .cfi_def_cfa_offset (offset from RETADDR) .seh_pushreg % ; If the required stack alignment > default stack alignment ; rsp needs to be re-aligned. This creates a "re-alignment gap" ; of unknown size in the stack frame. [if stack needs re-alignment] and $MASK, %rsp ; Allocate space for locals [if target is Windows and allocated space > 4096 bytes] ; Windows needs special care for allocations larger ; than one page. mov $NNN, %rax call ___chkstk_ms/___chkstk sub %rax, %rsp [else] sub $NNN, %rsp [if needs FP] .seh_stackalloc (size of XMM spill slots) .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots [else] .seh_stackalloc NNN ; Spill XMMs ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved, ; they may get spilled on any platform, if the current function ; calls @llvm.eh.unwind.init [if needs FP] [for all callee-saved XMM registers] movaps %, -MMM(%rbp) [for all callee-saved XMM registers] .seh_savexmm %, (-MMM + SEHFrameOffset) ; i.e. the offset relative to (%rbp - SEHFrameOffset) [else] [for all callee-saved XMM registers] movaps %, KKK(%rsp) [for all callee-saved XMM registers] .seh_savexmm %, KKK .seh_endprologue [if needs base pointer] mov %rsp, %rbx [if needs to restore base pointer] mov %rsp, -MMM(%rbp) ; Emit CFI info [if needs FP] [for all callee-saved registers] .cfi_offset %, (offset from %rbp) [else] .cfi_def_cfa_offset (offset from RETADDR) [for all callee-saved registers] .cfi_offset %, (offset from %rsp) Notes: - .seh directives are emitted only for Windows 64 ABI - .cfi directives are emitted for all other ABIs - for 32-bit code, substitute %e?? registers for %r?? */ void X86FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { assert(&STI == &MF.getSubtarget() && "MF used frame lowering for wrong subtarget"); MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo(); uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. bool IsFunclet = MBB.isEHFuncletEntry(); EHPersonality Personality = EHPersonality::Unknown; if (Fn->hasPersonalityFn()) Personality = classifyEHPersonality(Fn->getPersonalityFn()); bool FnHasClrFunclet = MMI.hasEHFunclets() && Personality == EHPersonality::CoreCLR; bool IsClrFunclet = IsFunclet && FnHasClrFunclet; bool HasFP = hasFP(MF); bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv()); bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinCFI = IsWin64Prologue && Fn->needsUnwindTableEntry(); bool NeedsDwarfCFI = !IsWin64Prologue && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); unsigned FramePtr = TRI->getFrameRegister(MF); const unsigned MachineFramePtr = STI.isTarget64BitILP32() ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; unsigned BasePtr = TRI->getBaseRegister(); // Debug location must be unknown since the first debug location is used // to determine the end of the prologue. DebugLoc DL; // Add RETADDR move area to callee saved frame size. int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta && IsWin64Prologue) report_fatal_error("Can't handle guaranteed tail call under win64 yet"); if (TailCallReturnAddrDelta < 0) X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO()); // The default stack probe size is 4096 if the function has no stackprobesize // attribute. unsigned StackProbeSize = 4096; if (Fn->hasFnAttribute("stack-probe-size")) Fn->getFnAttribute("stack-probe-size") .getValueAsString() .getAsInteger(0, StackProbeSize); // If this is x86-64 and the Red Zone is not disabled, if we are a leaf // function, and use up to 128 bytes of stack space, don't have a frame // pointer, calls, or dynamic alloca then we do not need to adjust the // stack pointer (we fit in the Red Zone). We also check that we don't // push and pop from the stack. if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) && !TRI->needsStackRealignment(MF) && !MFI->hasVarSizedObjects() && // No dynamic alloca. !MFI->adjustsStack() && // No calls. !IsWin64CC && // Win64 has no Red Zone !MFI->hasCopyImplyingStackAdjustment() && // Don't push and pop. !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); MFI->setStackSize(StackSize); } // Insert stack pointer adjustment for later moving of return addr. Only // applies to tail call optimized functions where the callee argument stack // size is bigger than the callers. if (TailCallReturnAddrDelta < 0) { BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta, /*InEpilogue=*/false) .setMIFlag(MachineInstr::FrameSetup); } // Mapping for machine moves: // // DST: VirtualFP AND // SRC: VirtualFP => DW_CFA_def_cfa_offset // ELSE => DW_CFA_def_cfa // // SRC: VirtualFP AND // DST: Register => DW_CFA_def_cfa_register // // ELSE // OFFSET < 0 => DW_CFA_offset_extended_sf // REG < 64 => DW_CFA_offset + Reg // ELSE => DW_CFA_offset_extended uint64_t NumBytes = 0; int stackGrowth = -SlotSize; // Find the funclet establisher parameter unsigned Establisher = X86::NoRegister; if (IsClrFunclet) Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX; else if (IsFunclet) Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX; if (IsWin64Prologue && IsFunclet && !IsClrFunclet) { // Immediately spill establisher into the home slot. // The runtime cares about this. // MOV64mr %rdx, 16(%rsp) unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16) .addReg(Establisher) .setMIFlag(MachineInstr::FrameSetup); MBB.addLiveIn(Establisher); } if (HasFP) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; // If required, include space for extra hidden slot for stashing base pointer. if (X86FI->getRestoreBasePointer()) FrameSize += SlotSize; NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); // Callee-saved registers are pushed on stack before the stack is realigned. if (TRI->needsStackRealignment(MF) && !IsWin64Prologue) NumBytes = RoundUpToAlignment(NumBytes, MaxAlign); // Get the offset of the stack slot for the EBP register, which is // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. // Update the frame offset adjustment. if (!IsFunclet) MFI->setOffsetAdjustment(-NumBytes); else assert(MFI->getOffsetAdjustment() == -(int)NumBytes && "should calculate same local variable offset for funclets"); // Save EBP/RBP into the appropriate stack slot. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) .addReg(MachineFramePtr, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); if (NeedsDwarfCFI) { // Mark the place where EBP/RBP was saved. // Define the current CFA rule to use the provided offset. assert(StackSize); BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth)); // Change the rule for the FramePtr to be an "offset" rule. unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset( nullptr, DwarfFramePtr, 2 * stackGrowth)); } if (NeedsWinCFI) { BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) .addImm(FramePtr) .setMIFlag(MachineInstr::FrameSetup); } if (!IsWin64Prologue && !IsFunclet) { // Update EBP with the new base value. BuildMI(MBB, MBBI, DL, TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); if (NeedsDwarfCFI) { // Mark effective beginning of when frame pointer becomes valid. // Define the current CFA to use the EBP/RBP register. unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister( nullptr, DwarfFramePtr)); } } // Mark the FramePtr as live-in in every block. Don't do this again for // funclet prologues. if (!IsFunclet) { for (MachineBasicBlock &EveryMBB : MF) EveryMBB.addLiveIn(MachineFramePtr); } } else { assert(!IsFunclet && "funclets without FPs not yet implemented"); NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); } // For EH funclets, only allocate enough space for outgoing calls. Save the // NumBytes value that we would've used for the parent frame. unsigned ParentFrameNumBytes = NumBytes; if (IsFunclet) NumBytes = getWinEHFuncletFrameSize(MF); // Skip the callee-saved push instructions. bool PushedRegs = false; int StackOffset = 2 * stackGrowth; while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup) && (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { PushedRegs = true; unsigned Reg = MBBI->getOperand(0).getReg(); ++MBBI; if (!HasFP && NeedsDwarfCFI) { // Mark callee-saved push instruction. // Define the current CFA rule to use the provided offset. assert(StackSize); BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset)); StackOffset += stackGrowth; } if (NeedsWinCFI) { BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag( MachineInstr::FrameSetup); } } // Realign stack after we pushed callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). // Don't do this for Win64, it needs to realign the stack after the prologue. if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign); } // If there is an SUB32ri of ESP immediately before this instruction, merge // the two. This can be the case when tail call elimination is enabled and // the callee has more arguments then the caller. NumBytes -= mergeSPUpdates(MBB, MBBI, true); // Adjust stack pointer: ESP -= numbytes. // Windows and cygwin/mingw require a prologue helper routine when allocating // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the // stack and adjust the stack pointer in one go. The 64-bit version of // __chkstk is only responsible for probing the stack. The 64-bit prologue is // responsible for adjusting the stack pointer. Touching the stack at 4K // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. uint64_t AlignedNumBytes = NumBytes; if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign); if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { // Check whether EAX is livein for this function. bool isEAXAlive = isEAXLiveIn(MF); if (isEAXAlive) { // Sanity check that EAX is not livein for this function. // It should not be, so throw an assert. assert(!Is64Bit && "EAX is livein in x64 case!"); // Save EAX BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) .addReg(X86::EAX, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); } if (Is64Bit) { // Handle the 64-bit Windows ABI case where we need to call __chkstk. // Function prologue is responsible for adjusting the stack pointer. if (isUInt<32>(NumBytes)) { BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) .addImm(NumBytes) .setMIFlag(MachineInstr::FrameSetup); } else if (isInt<32>(NumBytes)) { BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX) .addImm(NumBytes) .setMIFlag(MachineInstr::FrameSetup); } else { BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) .addImm(NumBytes) .setMIFlag(MachineInstr::FrameSetup); } } else { // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. // We'll also use 4 already allocated bytes for EAX. BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) .setMIFlag(MachineInstr::FrameSetup); } // Call __chkstk, __chkstk_ms, or __alloca. emitStackProbe(MF, MBB, MBBI, DL, true); if (isEAXAlive) { // Restore EAX MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX), StackPtr, false, NumBytes - 4); MI->setFlag(MachineInstr::FrameSetup); MBB.insert(MBBI, MI); } } else if (NumBytes) { emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false); } if (NeedsWinCFI && NumBytes) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) .addImm(NumBytes) .setMIFlag(MachineInstr::FrameSetup); int SEHFrameOffset = 0; unsigned SPOrEstablisher; if (IsFunclet) { if (IsClrFunclet) { // The establisher parameter passed to a CLR funclet is actually a pointer // to the (mostly empty) frame of its nearest enclosing funclet; we have // to find the root function establisher frame by loading the PSPSym from // the intermediate frame. unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); MachinePointerInfo NoInfo; MBB.addLiveIn(Establisher); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher), Establisher, false, PSPSlotOffset) .addMemOperand(MF.getMachineMemOperand( NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize)); ; // Save the root establisher back into the current funclet's (mostly // empty) frame, in case a sub-funclet or the GC needs it. addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false, PSPSlotOffset) .addReg(Establisher) .addMemOperand( MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile, SlotSize, SlotSize)); } SPOrEstablisher = Establisher; } else { SPOrEstablisher = StackPtr; } if (IsWin64Prologue && HasFP) { // Set RBP to a small fixed offset from RSP. In the funclet case, we base // this calculation on the incoming establisher, which holds the value of // RSP from the parent frame at the end of the prologue. SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes); if (SEHFrameOffset) addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr), SPOrEstablisher, false, SEHFrameOffset); else BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr) .addReg(SPOrEstablisher); // If this is not a funclet, emit the CFI describing our frame pointer. if (NeedsWinCFI && !IsFunclet) { BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) .addImm(FramePtr) .addImm(SEHFrameOffset) .setMIFlag(MachineInstr::FrameSetup); if (isAsynchronousEHPersonality(Personality)) MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset; } } else if (IsFunclet && STI.is32Bit()) { // Reset EBP / ESI to something good for funclets. MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL); // If we're a catch funclet, we can be returned to via catchret. Save ESP // into the registration node so that the runtime will restore it for us. if (!MBB.isCleanupFuncletEntry()) { assert(Personality == EHPersonality::MSVC_CXX); unsigned FrameReg; int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex; int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg); // ESP is the first field, so no extra displacement is needed. addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg, false, EHRegOffset) .addReg(X86::ESP); } } while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) { const MachineInstr *FrameInstr = &*MBBI; ++MBBI; if (NeedsWinCFI) { int FI; if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { if (X86::FR64RegClass.contains(Reg)) { unsigned IgnoredFrameReg; int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg); Offset += SEHFrameOffset; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) .addImm(Reg) .addImm(Offset) .setMIFlag(MachineInstr::FrameSetup); } } } } if (NeedsWinCFI) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) .setMIFlag(MachineInstr::FrameSetup); if (FnHasClrFunclet && !IsFunclet) { // Save the so-called Initial-SP (i.e. the value of the stack pointer // immediately after the prolog) into the PSPSlot so that funclets // and the GC can recover it. unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); auto PSPInfo = MachinePointerInfo::getFixedStack( MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false, PSPSlotOffset) .addReg(StackPtr) .addMemOperand(MF.getMachineMemOperand( PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile, SlotSize, SlotSize)); } // Realign stack after we spilled callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). // Win64 requires aligning the stack after the prologue. if (IsWin64Prologue && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign); } // We already dealt with stack realignment and funclets above. if (IsFunclet && STI.is32Bit()) return; // If we need a base pointer, set it up here. It's whatever the value // of the stack pointer is at this point. Any variable size objects // will be allocated after this, so we can still use the base pointer // to reference locals. if (TRI->hasBasePointer(MF)) { // Update the base pointer with the current stack pointer. unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) .addReg(SPOrEstablisher) .setMIFlag(MachineInstr::FrameSetup); if (X86FI->getRestoreBasePointer()) { // Stash value of base pointer. Saving RSP instead of EBP shortens // dependence chain. Used by SjLj EH. unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true, X86FI->getRestoreBasePointerOffset()) .addReg(SPOrEstablisher) .setMIFlag(MachineInstr::FrameSetup); } if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) { // Stash the value of the frame pointer relative to the base pointer for // Win32 EH. This supports Win32 EH, which does the inverse of the above: // it recovers the frame pointer from the base pointer rather than the // other way around. unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; unsigned UsedReg; int Offset = getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); assert(UsedReg == BasePtr); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset) .addReg(FramePtr) .setMIFlag(MachineInstr::FrameSetup); } } if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { // Mark end of stack pointer adjustment. if (!HasFP && NumBytes) { // Define the current CFA rule to use the provided offset. assert(StackSize); BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset( nullptr, -StackSize + stackGrowth)); } // Emit DWARF info specifying the offsets of the callee-saved registers. if (PushedRegs) emitCalleeSavedFrameMoves(MBB, MBBI, DL); } } bool X86FrameLowering::canUseLEAForSPInEpilogue( const MachineFunction &MF) const { // We can't use LEA instructions for adjusting the stack pointer if this is a // leaf function in the Win64 ABI. Only ADD instructions may be used to // deallocate the stack. // This means that we can use LEA for SP in two situations: // 1. We *aren't* using the Win64 ABI which means we are free to use LEA. // 2. We *have* a frame pointer which means we are permitted to use LEA. return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF); } static bool isFuncletReturnInstr(MachineInstr *MI) { switch (MI->getOpcode()) { case X86::CATCHRET: case X86::CLEANUPRET: return true; default: return false; } llvm_unreachable("impossible"); } // CLR funclets use a special "Previous Stack Pointer Symbol" slot on the // stack. It holds a pointer to the bottom of the root function frame. The // establisher frame pointer passed to a nested funclet may point to the // (mostly empty) frame of its parent funclet, but it will need to find // the frame of the root function to access locals. To facilitate this, // every funclet copies the pointer to the bottom of the root function // frame into a PSPSym slot in its own (mostly empty) stack frame. Using the // same offset for the PSPSym in the root function frame that's used in the // funclets' frames allows each funclet to dynamically accept any ancestor // frame as its establisher argument (the runtime doesn't guarantee the // immediate parent for some reason lost to history), and also allows the GC, // which uses the PSPSym for some bookkeeping, to find it in any funclet's // frame with only a single offset reported for the entire method. unsigned X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const { const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo(); // getFrameIndexReferenceFromSP has an out ref parameter for the stack // pointer register; pass a dummy that we ignore unsigned SPReg; int Offset = getFrameIndexReferenceFromSP(MF, Info.PSPSymFrameIdx, SPReg); assert(Offset >= 0); return static_cast(Offset); } unsigned X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { // This is the size of the pushed CSRs. unsigned CSSize = MF.getInfo()->getCalleeSavedFrameSize(); // This is the amount of stack a funclet needs to allocate. unsigned UsedSize; EHPersonality Personality = classifyEHPersonality(MF.getFunction()->getPersonalityFn()); if (Personality == EHPersonality::CoreCLR) { // CLR funclets need to hold enough space to include the PSPSym, at the // same offset from the stack pointer (immediately after the prolog) as it // resides at in the main function. UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize; } else { // Other funclets just need enough stack for outgoing call arguments. UsedSize = MF.getFrameInfo()->getMaxCallFrameSize(); } // RBP is not included in the callee saved register block. After pushing RBP, // everything is 16 byte aligned. Everything we allocate before an outgoing // call must also be 16 byte aligned. unsigned FrameSizeMinusRBP = RoundUpToAlignment(CSSize + UsedSize, getStackAlignment()); // Subtract out the size of the callee saved registers. This is how much stack // each funclet will allocate. return FrameSizeMinusRBP - CSSize; } void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); DebugLoc DL; if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. const bool Is64BitILP32 = STI.isTarget64BitILP32(); unsigned FramePtr = TRI->getFrameRegister(MF); unsigned MachineFramePtr = Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinCFI = IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry(); bool IsFunclet = isFuncletReturnInstr(MBBI); MachineBasicBlock *TargetMBB = nullptr; // Get the number of bytes to allocate from the FrameInfo. uint64_t StackSize = MFI->getStackSize(); uint64_t MaxAlign = calculateMaxStackAlign(MF); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t NumBytes = 0; if (MBBI->getOpcode() == X86::CATCHRET) { // SEH shouldn't use catchret. assert(!isAsynchronousEHPersonality( classifyEHPersonality(MF.getFunction()->getPersonalityFn())) && "SEH should not use CATCHRET"); NumBytes = getWinEHFuncletFrameSize(MF); assert(hasFP(MF) && "EH funclets without FP not yet implemented"); TargetMBB = MBBI->getOperand(0).getMBB(); // Pop EBP. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr) .setMIFlag(MachineInstr::FrameDestroy); } else if (MBBI->getOpcode() == X86::CLEANUPRET) { NumBytes = getWinEHFuncletFrameSize(MF); assert(hasFP(MF) && "EH funclets without FP not yet implemented"); BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr) .setMIFlag(MachineInstr::FrameDestroy); } else if (hasFP(MF)) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; NumBytes = FrameSize - CSSize; // Callee-saved registers were pushed on stack before the stack was // realigned. if (TRI->needsStackRealignment(MF) && !IsWin64Prologue) NumBytes = RoundUpToAlignment(FrameSize, MaxAlign); // Pop EBP. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr) .setMIFlag(MachineInstr::FrameDestroy); } else { NumBytes = StackSize - CSSize; } uint64_t SEHStackAllocAmt = NumBytes; // Skip the callee-saved pop instructions. while (MBBI != MBB.begin()) { MachineBasicBlock::iterator PI = std::prev(MBBI); unsigned Opc = PI->getOpcode(); if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) && (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) && Opc != X86::DBG_VALUE && !PI->isTerminator()) break; --MBBI; } MachineBasicBlock::iterator FirstCSPop = MBBI; if (TargetMBB) { // Fill EAX/RAX with the address of the target block. unsigned ReturnReg = STI.is64Bit() ? X86::RAX : X86::EAX; if (STI.is64Bit()) { // LEA64r TargetMBB(%rip), %rax BuildMI(MBB, FirstCSPop, DL, TII.get(X86::LEA64r), ReturnReg) .addReg(X86::RIP) .addImm(0) .addReg(0) .addMBB(TargetMBB) .addReg(0); } else { // MOV32ri $TargetMBB, %eax BuildMI(MBB, FirstCSPop, DL, TII.get(X86::MOV32ri), ReturnReg) .addMBB(TargetMBB); } // Record that we've taken the address of TargetMBB and no longer just // reference it in a terminator. TargetMBB->setHasAddressTaken(); } if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); // If there is an ADD32ri or SUB32ri of ESP immediately before this // instruction, merge the two instructions. if (NumBytes || MFI->hasVarSizedObjects()) NumBytes += mergeSPUpdates(MBB, MBBI, true); // If dynamic alloca is used, then reset esp to point to the last callee-saved // slot before popping them off! Same applies for the case, when stack was // realigned. Don't do this if this was a funclet epilogue, since the funclets // will not do realignment or dynamic stack allocation. if ((TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) && !IsFunclet) { if (TRI->needsStackRealignment(MF)) MBBI = FirstCSPop; unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt); uint64_t LEAAmount = IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize; // There are only two legal forms of epilogue: // - add SEHAllocationSize, %rsp // - lea SEHAllocationSize(%FramePtr), %rsp // // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence. // However, we may use this sequence if we have a frame pointer because the // effects of the prologue can safely be undone. if (LEAAmount != 0) { unsigned Opc = getLEArOpcode(Uses64BitFramePtr); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), FramePtr, false, LEAAmount); --MBBI; } else { unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr); BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(FramePtr); --MBBI; } } else if (NumBytes) { // Adjust stack pointer back: ESP += numbytes. emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true); --MBBI; } // Windows unwinder will not invoke function's exception handler if IP is // either in prologue or in epilogue. This behavior causes a problem when a // call immediately precedes an epilogue, because the return address points // into the epilogue. To cope with that, we insert an epilogue marker here, // then replace it with a 'nop' if it ends up immediately after a CALL in the // final emitted code. if (NeedsWinCFI) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); // Add the return addr area delta back since we are not tail calling. int Offset = -1 * X86FI->getTCReturnAddrDelta(); assert(Offset >= 0 && "TCDelta should never be positive"); if (Offset) { MBBI = MBB.getFirstTerminator(); // Check for possible merge with preceding ADD instruction. Offset += mergeSPUpdates(MBB, MBBI, true); emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true); } } // NOTE: this only has a subset of the full frame index logic. In // particular, the FI < 0 and AfterFPPop logic is handled in // X86RegisterInfo::eliminateFrameIndex, but not here. Possibly // (probably?) it should be moved into here. int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); // We can't calculate offset from frame pointer if the stack is realigned, // so enforce usage of stack/base pointer. The base pointer is used when we // have dynamic allocas in addition to dynamic realignment. if (TRI->hasBasePointer(MF)) FrameReg = TRI->getBaseRegister(); else if (TRI->needsStackRealignment(MF)) FrameReg = TRI->getStackRegister(); else FrameReg = TRI->getFrameRegister(MF); // Offset will hold the offset from the stack pointer at function entry to the // object. // We need to factor in additional offsets applied during the prologue to the // frame, base, and stack pointer depending on which is used. int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); const X86MachineFunctionInfo *X86FI = MF.getInfo(); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t StackSize = MFI->getStackSize(); bool HasFP = hasFP(MF); bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); int64_t FPDelta = 0; if (IsWin64Prologue) { assert(!MFI->hasCalls() || (StackSize % 16) == 8); // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; // If required, include space for extra hidden slot for stashing base pointer. if (X86FI->getRestoreBasePointer()) FrameSize += SlotSize; uint64_t NumBytes = FrameSize - CSSize; uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes); if (FI && FI == X86FI->getFAIndex()) return -SEHFrameOffset; // FPDelta is the offset from the "traditional" FP location of the old base // pointer followed by return address and the location required by the // restricted Win64 prologue. // Add FPDelta to all offsets below that go through the frame pointer. FPDelta = FrameSize - SEHFrameOffset; assert((!MFI->hasCalls() || (FPDelta % 16) == 0) && "FPDelta isn't aligned per the Win64 ABI!"); } if (TRI->hasBasePointer(MF)) { assert(HasFP && "VLAs and dynamic stack realign, but no FP?!"); if (FI < 0) { // Skip the saved EBP. return Offset + SlotSize + FPDelta; } else { assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); return Offset + StackSize; } } else if (TRI->needsStackRealignment(MF)) { if (FI < 0) { // Skip the saved EBP. return Offset + SlotSize + FPDelta; } else { assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); return Offset + StackSize; } // FIXME: Support tail calls } else { if (!HasFP) return Offset + StackSize; // Skip the saved EBP. Offset += SlotSize; // Skip the RETADDR move area int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta; } return Offset + FPDelta; } // Simplified from getFrameIndexReference keeping only StackPointer cases int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); // Does not include any dynamic realign. const uint64_t StackSize = MFI->getStackSize(); { #ifndef NDEBUG // LLVM arranges the stack as follows: // ... // ARG2 // ARG1 // RETADDR // PUSH RBP <-- RBP points here // PUSH CSRs // ~~~~~~~ <-- possible stack realignment (non-win64) // ... // STACK OBJECTS // ... <-- RSP after prologue points here // ~~~~~~~ <-- possible stack realignment (win64) // // if (hasVarSizedObjects()): // ... <-- "base pointer" (ESI/RBX) points here // DYNAMIC ALLOCAS // ... <-- RSP points here // // Case 1: In the simple case of no stack realignment and no dynamic // allocas, both "fixed" stack objects (arguments and CSRs) are addressable // with fixed offsets from RSP. // // Case 2: In the case of stack realignment with no dynamic allocas, fixed // stack objects are addressed with RBP and regular stack objects with RSP. // // Case 3: In the case of dynamic allocas and stack realignment, RSP is used // to address stack arguments for outgoing calls and nothing else. The "base // pointer" points to local variables, and RBP points to fixed objects. // // In cases 2 and 3, we can only answer for non-fixed stack objects, and the // answer we give is relative to the SP after the prologue, and not the // SP in the middle of the function. assert((!MFI->isFixedObjectIndex(FI) || !TRI->needsStackRealignment(MF) || STI.isTargetWin64()) && "offset from fixed object to SP is not static"); // We don't handle tail calls, and shouldn't be seeing them either. int TailCallReturnAddrDelta = MF.getInfo()->getTCReturnAddrDelta(); assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!"); #endif } // Fill in FrameReg output argument. FrameReg = TRI->getStackRegister(); // This is how the math works out: // // %rsp grows (i.e. gets lower) left to right. Each box below is // one word (eight bytes). Obj0 is the stack slot we're trying to // get to. // // ---------------------------------- // | BP | Obj0 | Obj1 | ... | ObjN | // ---------------------------------- // ^ ^ ^ ^ // A B C E // // A is the incoming stack pointer. // (B - A) is the local area offset (-8 for x86-64) [1] // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2] // // |(E - B)| is the StackSize (absolute value, positive). For a // stack that grown down, this works out to be (B - E). [3] // // E is also the value of %rsp after stack has been set up, and we // want (C - E) -- the value we can add to %rsp to get to Obj0. Now // (C - E) == (C - A) - (B - A) + (B - E) // { Using [1], [2] and [3] above } // == getObjectOffset - LocalAreaOffset + StackSize // // Get the Offset from the StackPointer int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); return Offset + StackSize; } bool X86FrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector &CSI) const { MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo(); unsigned CalleeSavedFrameSize = 0; int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); if (hasFP(MF)) { // emitPrologue always spills frame register the first thing. SpillSlotOffset -= SlotSize; MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); // Since emitPrologue and emitEpilogue will handle spilling and restoring of // the frame register, we can delete it from CSI list and not have to worry // about avoiding it later. unsigned FPReg = TRI->getFrameRegister(MF); for (unsigned i = 0; i < CSI.size(); ++i) { if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { CSI.erase(CSI.begin() + i); break; } } } // Assign slots for GPRs. It increases frame size. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i - 1].getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; SpillSlotOffset -= SlotSize; CalleeSavedFrameSize += SlotSize; int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); } X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); // Assign slots for XMMs. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i - 1].getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); // ensure alignment SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment(); // spill into slot SpillSlotOffset -= RC->getSize(); int SlotIndex = MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); MFI->ensureMaxAlignment(RC->getAlignment()); } return true; } bool X86FrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(MI); // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI // for us, and there are no XMM CSRs on Win32. if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows()) return true; // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i - 1].getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); } // Make XMM regs spilled. X86 does not have ability of push/pop XMM. // It can be done by spilling XMMs to stack frame. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC, TRI); --MI; MI->setFlag(MachineInstr::FrameSetup); ++MI; } return true; } bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const { if (CSI.empty()) return false; if (isFuncletReturnInstr(MI) && STI.isOSWindows()) { // Don't restore CSRs in 32-bit EH funclets. Matches // spillCalleeSavedRegisters. if (STI.is32Bit()) return true; // Don't restore CSRs before an SEH catchret. SEH except blocks do not form // funclets. emitEpilogue transforms these to normal jumps. if (MI->getOpcode() == X86::CATCHRET) { const Function *Func = MBB.getParent()->getFunction(); bool IsSEH = isAsynchronousEHPersonality( classifyEHPersonality(Func->getPersonalityFn())); if (IsSEH) return true; } } DebugLoc DL = MBB.findDebugLoc(MI); // Reload XMMs from stack frame. for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); } // POP GPRs. unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; BuildMI(MBB, MI, DL, TII.get(Opc), Reg) .setMIFlag(MachineInstr::FrameDestroy); } return true; } void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo(); int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta < 0) { // create RETURNADDR area // arg // arg // RETADDR // { ... // RETADDR area // ... // } // [EBP] MFI->CreateFixedObject(-TailCallReturnAddrDelta, TailCallReturnAddrDelta - SlotSize, true); } // Spill the BasePtr if it's used. if (TRI->hasBasePointer(MF)) { SavedRegs.set(TRI->getBaseRegister()); // Allocate a spill slot for EBP if we have a base pointer and EH funclets. if (MF.getMMI().hasEHFunclets()) { int FI = MFI->CreateSpillStackObject(SlotSize, SlotSize); X86FI->setHasSEHFramePtrSave(true); X86FI->setSEHFramePtrSaveIndex(FI); } } } static bool HasNestArgument(const MachineFunction *MF) { const Function *F = MF->getFunction(); for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; I++) { if (I->hasNestAttr()) return true; } return false; } /// GetScratchRegister - Get a temp register for performing work in the /// segmented stack and the Erlang/HiPE stack prologue. Depending on platform /// and the properties of the function either one or two registers will be /// needed. Set primary to true for the first register, false for the second. static unsigned GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) { CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); // Erlang stuff. if (CallingConvention == CallingConv::HiPE) { if (Is64Bit) return Primary ? X86::R14 : X86::R13; else return Primary ? X86::EBX : X86::EDI; } if (Is64Bit) { if (IsLP64) return Primary ? X86::R11 : X86::R12; else return Primary ? X86::R11D : X86::R12D; } bool IsNested = HasNestArgument(&MF); if (CallingConvention == CallingConv::X86_FastCall || CallingConvention == CallingConv::Fast) { if (IsNested) report_fatal_error("Segmented stacks does not support fastcall with " "nested function."); return Primary ? X86::EAX : X86::ECX; } if (IsNested) return Primary ? X86::EDX : X86::EAX; return Primary ? X86::ECX : X86::EAX; } // The stack limit in the TCB is set to this many bytes above the actual stack // limit. static const uint64_t kSplitStackAvailable = 256; void X86FrameLowering::adjustForSegmentedStacks( MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { MachineFrameInfo *MFI = MF.getFrameInfo(); uint64_t StackSize; unsigned TlsReg, TlsOffset; DebugLoc DL; + // To support shrink-wrapping we would need to insert the new blocks + // at the right place and update the branches to PrologueMBB. + assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet"); + unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "Scratch register is live-in"); if (MF.getFunction()->isVarArg()) report_fatal_error("Segmented stacks do not support vararg functions."); if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() && !STI.isTargetWin64() && !STI.isTargetFreeBSD() && !STI.isTargetDragonFly()) report_fatal_error("Segmented stacks not supported on this platform."); // Eventually StackSize will be calculated by a link-time pass; which will // also decide whether checking code needs to be injected into this particular // prologue. StackSize = MFI->getStackSize(); // Do not generate a prologue for functions with a stack of size zero if (StackSize == 0) return; MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock(); X86MachineFunctionInfo *X86FI = MF.getInfo(); bool IsNested = false; // We need to know if the function has a nest argument only in 64 bit mode. if (Is64Bit) IsNested = HasNestArgument(&MF); // The MOV R10, RAX needs to be in a different block, since the RET we emit in // allocMBB needs to be last (terminating) instruction. for (const auto &LI : PrologueMBB.liveins()) { allocMBB->addLiveIn(LI); checkMBB->addLiveIn(LI); } if (IsNested) allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D); MF.push_front(allocMBB); MF.push_front(checkMBB); // When the frame size is less than 256 we just compare the stack // boundary directly to the value of the stack pointer, per gcc. bool CompareStackPointer = StackSize < kSplitStackAvailable; // Read the limit off the current stacklet off the stack_guard location. if (Is64Bit) { if (STI.isTargetLinux()) { TlsReg = X86::FS; TlsOffset = IsLP64 ? 0x70 : 0x40; } else if (STI.isTargetDarwin()) { TlsReg = X86::GS; TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90. } else if (STI.isTargetWin64()) { TlsReg = X86::GS; TlsOffset = 0x28; // pvArbitrary, reserved for application use } else if (STI.isTargetFreeBSD()) { TlsReg = X86::FS; TlsOffset = 0x18; } else if (STI.isTargetDragonFly()) { TlsReg = X86::FS; TlsOffset = 0x20; // use tls_tcb.tcb_segstack } else { report_fatal_error("Segmented stacks not supported on this platform."); } if (CompareStackPointer) ScratchReg = IsLP64 ? X86::RSP : X86::ESP; else BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP) .addImm(1).addReg(0).addImm(-StackSize).addReg(0); BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else { if (STI.isTargetLinux()) { TlsReg = X86::GS; TlsOffset = 0x30; } else if (STI.isTargetDarwin()) { TlsReg = X86::GS; TlsOffset = 0x48 + 90*4; } else if (STI.isTargetWin32()) { TlsReg = X86::FS; TlsOffset = 0x14; // pvArbitrary, reserved for application use } else if (STI.isTargetDragonFly()) { TlsReg = X86::FS; TlsOffset = 0x10; // use tls_tcb.tcb_segstack } else if (STI.isTargetFreeBSD()) { report_fatal_error("Segmented stacks not supported on FreeBSD i386."); } else { report_fatal_error("Segmented stacks not supported on this platform."); } if (CompareStackPointer) ScratchReg = X86::ESP; else BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) .addImm(1).addReg(0).addImm(-StackSize).addReg(0); if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() || STI.isTargetDragonFly()) { BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else if (STI.isTargetDarwin()) { // TlsOffset doesn't fit into a mod r/m byte so we need an extra register. unsigned ScratchReg2; bool SaveScratch2; if (CompareStackPointer) { // The primary scratch register is available for holding the TLS offset. ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true); SaveScratch2 = false; } else { // Need to use a second register to hold the TLS offset ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false); // Unfortunately, with fastcc the second scratch register may hold an // argument. SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2); } // If Scratch2 is live-in then it needs to be saved. assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) && "Scratch register is live-in and not saved"); if (SaveScratch2) BuildMI(checkMBB, DL, TII.get(X86::PUSH32r)) .addReg(ScratchReg2, RegState::Kill); BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2) .addImm(TlsOffset); BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)) .addReg(ScratchReg) .addReg(ScratchReg2).addImm(1).addReg(0) .addImm(0) .addReg(TlsReg); if (SaveScratch2) BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2); } } // This jump is taken if SP >= (Stacklet Limit + Stack Space required). // It jumps to normal execution of the function body. BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&PrologueMBB); // On 32 bit we first push the arguments size and then the frame size. On 64 // bit, we pass the stack frame size in r10 and the argument size in r11. if (Is64Bit) { // Functions with nested arguments use R10, so it needs to be saved across // the call to _morestack const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX; const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D; const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D; const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr; const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri; if (IsNested) BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10); BuildMI(allocMBB, DL, TII.get(MOVri), Reg10) .addImm(StackSize); BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) .addImm(X86FI->getArgumentStackSize()); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(StackSize); } // __morestack is in libgcc if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { // Under the large code model, we cannot assume that __morestack lives // within 2^31 bytes of the call site, so we cannot use pc-relative // addressing. We cannot perform the call via a temporary register, // as the rax register may be used to store the static chain, and all // other suitable registers may be either callee-save or used for // parameter passing. We cannot use the stack at this point either // because __morestack manipulates the stack directly. // // To avoid these issues, perform an indirect call via a read-only memory // location containing the address. // // This solution is not perfect, as it assumes that the .rodata section // is laid out within 2^31 bytes of each function body, but this seems // to be sufficient for JIT. BuildMI(allocMBB, DL, TII.get(X86::CALL64m)) .addReg(X86::RIP) .addImm(0) .addReg(0) .addExternalSymbol("__morestack_addr") .addReg(0); MF.getMMI().setUsesMorestackAddr(true); } else { if (Is64Bit) BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack"); else BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) .addExternalSymbol("__morestack"); } if (IsNested) BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10)); else BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET)); allocMBB->addSuccessor(&PrologueMBB); checkMBB->addSuccessor(allocMBB); checkMBB->addSuccessor(&PrologueMBB); #ifdef XDEBUG MF.verify(); #endif } /// Erlang programs may need a special prologue to handle the stack size they /// might need at runtime. That is because Erlang/OTP does not implement a C /// stack but uses a custom implementation of hybrid stack/heap architecture. /// (for more information see Eric Stenman's Ph.D. thesis: /// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf) /// /// CheckStack: /// temp0 = sp - MaxStack /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart /// OldStart: /// ... /// IncStack: /// call inc_stack # doubles the stack space /// temp0 = sp - MaxStack /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart void X86FrameLowering::adjustForHiPEPrologue( MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { MachineFrameInfo *MFI = MF.getFrameInfo(); DebugLoc DL; + + // To support shrink-wrapping we would need to insert the new blocks + // at the right place and update the branches to PrologueMBB. + assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet"); + // HiPE-specific values const unsigned HipeLeafWords = 24; const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5; const unsigned Guaranteed = HipeLeafWords * SlotSize; unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ? MF.getFunction()->arg_size() - CCRegisteredArgs : 0; unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize; assert(STI.isTargetLinux() && "HiPE prologue is only supported on Linux operating systems."); // Compute the largest caller's frame that is needed to fit the callees' // frames. This 'MaxStack' is computed from: // // a) the fixed frame size, which is the space needed for all spilled temps, // b) outgoing on-stack parameter areas, and // c) the minimum stack space this function needs to make available for the // functions it calls (a tunable ABI property). if (MFI->hasCalls()) { unsigned MoreStackForCalls = 0; for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end(); MBBI != MBBE; ++MBBI) for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end(); MI != ME; ++MI) { if (!MI->isCall()) continue; // Get callee operand. const MachineOperand &MO = MI->getOperand(0); // Only take account of global function calls (no closures etc.). if (!MO.isGlobal()) continue; const Function *F = dyn_cast(MO.getGlobal()); if (!F) continue; // Do not update 'MaxStack' for primitive and built-in functions // (encoded with names either starting with "erlang."/"bif_" or not // having a ".", such as a simple .., or an // "_", such as the BIF "suspend_0") as they are executed on another // stack. if (F->getName().find("erlang.") != StringRef::npos || F->getName().find("bif_") != StringRef::npos || F->getName().find_first_of("._") == StringRef::npos) continue; unsigned CalleeStkArity = F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0; if (HipeLeafWords - 1 > CalleeStkArity) MoreStackForCalls = std::max(MoreStackForCalls, (HipeLeafWords - 1 - CalleeStkArity) * SlotSize); } MaxStack += MoreStackForCalls; } // If the stack frame needed is larger than the guaranteed then runtime checks // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue. if (MaxStack > Guaranteed) { MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock(); for (const auto &LI : PrologueMBB.liveins()) { stackCheckMBB->addLiveIn(LI); incStackMBB->addLiveIn(LI); } MF.push_front(incStackMBB); MF.push_front(stackCheckMBB); unsigned ScratchReg, SPReg, PReg, SPLimitOffset; unsigned LEAop, CMPop, CALLop; if (Is64Bit) { SPReg = X86::RSP; PReg = X86::RBP; LEAop = X86::LEA64r; CMPop = X86::CMP64rm; CALLop = X86::CALL64pcrel32; SPLimitOffset = 0x90; } else { SPReg = X86::ESP; PReg = X86::EBP; LEAop = X86::LEA32r; CMPop = X86::CMP32rm; CALLop = X86::CALLpcrel32; SPLimitOffset = 0x4c; } ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "HiPE prologue scratch register is live-in"); // Create new MBB for StackCheck: addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), SPReg, false, -MaxStack); // SPLimitOffset is in a fixed heap location (pointed by BP). addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&PrologueMBB); // Create new MBB for IncStack: BuildMI(incStackMBB, DL, TII.get(CALLop)). addExternalSymbol("inc_stack_0"); addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg), SPReg, false, -MaxStack); addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100}); stackCheckMBB->addSuccessor(incStackMBB, {1, 100}); incStackMBB->addSuccessor(&PrologueMBB, {99, 100}); incStackMBB->addSuccessor(incStackMBB, {1, 100}); } #ifdef XDEBUG MF.verify(); #endif } bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, int Offset) const { if (Offset <= 0) return false; if (Offset % SlotSize) return false; int NumPops = Offset / SlotSize; // This is only worth it if we have at most 2 pops. if (NumPops != 1 && NumPops != 2) return false; // Handle only the trivial case where the adjustment directly follows // a call. This is the most common one, anyway. if (MBBI == MBB.begin()) return false; MachineBasicBlock::iterator Prev = std::prev(MBBI); if (!Prev->isCall() || !Prev->getOperand(1).isRegMask()) return false; unsigned Regs[2]; unsigned FoundRegs = 0; auto RegMask = Prev->getOperand(1); auto &RegClass = Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass; // Try to find up to NumPops free registers. for (auto Candidate : RegClass) { // Poor man's liveness: // Since we're immediately after a call, any register that is clobbered // by the call and not defined by it can be considered dead. if (!RegMask.clobbersPhysReg(Candidate)) continue; bool IsDef = false; for (const MachineOperand &MO : Prev->implicit_operands()) { if (MO.isReg() && MO.isDef() && MO.getReg() == Candidate) { IsDef = true; break; } } if (IsDef) continue; Regs[FoundRegs++] = Candidate; if (FoundRegs == (unsigned)NumPops) break; } if (FoundRegs == 0) return false; // If we found only one free register, but need two, reuse the same one twice. while (FoundRegs < (unsigned)NumPops) Regs[FoundRegs++] = Regs[0]; for (int i = 0; i < NumPops; ++i) BuildMI(MBB, MBBI, DL, TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]); return true; } void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { bool reserveCallFrame = hasReservedCallFrame(MF); unsigned Opcode = I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); DebugLoc DL = I->getDebugLoc(); uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0; I = MBB.erase(I); if (!reserveCallFrame) { // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub ESP, ' and the // adjcallstackdown instruction into 'add ESP, ' // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. unsigned StackAlign = getStackAlignment(); Amount = RoundUpToAlignment(Amount, StackAlign); MachineModuleInfo &MMI = MF.getMMI(); const Function *Fn = MF.getFunction(); bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool DwarfCFI = !WindowsCFI && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); // If we have any exception handlers in this function, and we adjust // the SP before calls, we may need to indicate this to the unwinder // using GNU_ARGS_SIZE. Note that this may be necessary even when // Amount == 0, because the preceding function may have set a non-0 // GNU_ARGS_SIZE. // TODO: We don't need to reset this between subsequent functions, // if it didn't change. bool HasDwarfEHHandlers = !WindowsCFI && !MF.getMMI().getLandingPads().empty(); if (HasDwarfEHHandlers && !isDestroy && MF.getInfo()->getHasPushSequences()) BuildCFI(MBB, I, DL, MCCFIInstruction::createGnuArgsSize(nullptr, Amount)); if (Amount == 0) return; // Factor out the amount that gets handled inside the sequence // (Pushes of argument for frame setup, callee pops for frame destroy) Amount -= InternalAmt; // TODO: This is needed only if we require precise CFA. // If this is a callee-pop calling convention, emit a CFA adjust for // the amount the callee popped. if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF)) BuildCFI(MBB, I, DL, MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt)); if (Amount) { // Add Amount to SP to destroy a frame, and subtract to setup. int Offset = isDestroy ? Amount : -Amount; if (!(Fn->optForMinSize() && adjustStackWithPops(MBB, I, DL, Offset))) BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false); } if (DwarfCFI && !hasFP(MF)) { // If we don't have FP, but need to generate unwind information, // we need to set the correct CFA offset after the stack adjustment. // How much we adjust the CFA offset depends on whether we're emitting // CFI only for EH purposes or for debugging. EH only requires the CFA // offset to be correct at each call site, while for debugging we want // it to be more precise. int CFAOffset = Amount; // TODO: When not using precise CFA, we also need to adjust for the // InternalAmt here. if (CFAOffset) { CFAOffset = isDestroy ? -CFAOffset : CFAOffset; BuildCFI(MBB, I, DL, MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset)); } } return; } if (isDestroy && InternalAmt) { // If we are performing frame pointer elimination and if the callee pops // something off the stack pointer, add it back. We do this until we have // more advanced stack pointer tracking ability. // We are not tracking the stack pointer adjustment by the callee, so make // sure we restore the stack pointer immediately after the call, there may // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. MachineBasicBlock::iterator B = MBB.begin(); while (I != B && !std::prev(I)->isCall()) --I; BuildStackAdjustment(MBB, I, DL, -InternalAmt, /*InEpilogue=*/false); } } bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { assert(MBB.getParent() && "Block is not attached to a function!"); // Win64 has strict requirements in terms of epilogue and we are // not taking a chance at messing with them. // I.e., unless this block is already an exit block, we can't use // it as an epilogue. if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock()) return false; if (canUseLEAForSPInEpilogue(*MBB.getParent())) return true; // If we cannot use LEA to adjust SP, we may need to use ADD, which // clobbers the EFLAGS. Check that we do not need to preserve it, // otherwise, conservatively assume this is not // safe to insert the epilogue here. return !flagsNeedToBePreservedBeforeTheTerminators(MBB); } bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { // If we may need to emit frameless compact unwind information, give // up as this is currently broken: PR25614. - return MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF); + return (MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF)) && + // The lowering of segmented stack and HiPE only support entry blocks + // as prologue blocks: PR26107. + // This limitation may be lifted if we fix: + // - adjustForSegmentedStacks + // - adjustForHiPEPrologue + MF.getFunction()->getCallingConv() != CallingConv::HiPE && + !MF.shouldSplitStack(); } MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, bool RestoreSP) const { assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env"); assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32"); assert(STI.is32Bit() && !Uses64BitFramePtr && "restoring EBP/ESI on non-32-bit target"); MachineFunction &MF = *MBB.getParent(); unsigned FramePtr = TRI->getFrameRegister(MF); unsigned BasePtr = TRI->getBaseRegister(); WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); // FIXME: Don't set FrameSetup flag in catchret case. int FI = FuncInfo.EHRegNodeFrameIndex; int EHRegSize = MFI->getObjectSize(FI); if (RestoreSP) { // MOV32rm -EHRegSize(%ebp), %esp addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP), X86::EBP, true, -EHRegSize) .setMIFlag(MachineInstr::FrameSetup); } unsigned UsedReg; int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg); int EndOffset = -EHRegOffset - EHRegSize; FuncInfo.EHRegNodeEndOffset = EndOffset; if (UsedReg == FramePtr) { // ADD $offset, %ebp unsigned ADDri = getADDriOpcode(false, EndOffset); BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr) .addReg(FramePtr) .addImm(EndOffset) .setMIFlag(MachineInstr::FrameSetup) ->getOperand(3) .setIsDead(); assert(EndOffset >= 0 && "end of registration object above normal EBP position!"); } else if (UsedReg == BasePtr) { // LEA offset(%ebp), %esi addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr), FramePtr, false, EndOffset) .setMIFlag(MachineInstr::FrameSetup); // MOV32rm SavedEBPOffset(%esi), %ebp assert(X86FI->getHasSEHFramePtrSave()); int Offset = getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); assert(UsedReg == BasePtr); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr), UsedReg, true, Offset) .setMIFlag(MachineInstr::FrameSetup); } else { llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr"); } return MBBI; } unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const { // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue. unsigned Offset = 16; // RBP is immediately pushed. Offset += SlotSize; // All callee-saved registers are then pushed. Offset += MF.getInfo()->getCalleeSavedFrameSize(); // Every funclet allocates enough stack space for the largest outgoing call. Offset += getWinEHFuncletFrameSize(MF); return Offset; } void X86FrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. const Function *Fn = MF.getFunction(); if (!STI.is64Bit() || !MF.getMMI().hasEHFunclets() || classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX) return; // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset // relative to RSP after the prologue. Find the offset of the last fixed // object, so that we can allocate a slot immediately following it. If there // were no fixed objects, use offset -SlotSize, which is immediately after the // return address. Fixed objects have negative frame indices. MachineFrameInfo *MFI = MF.getFrameInfo(); int64_t MinFixedObjOffset = -SlotSize; for (int I = MFI->getObjectIndexBegin(); I < 0; ++I) MinFixedObjOffset = std::min(MinFixedObjOffset, MFI->getObjectOffset(I)); int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize; int UnwindHelpFI = MFI->CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false); MF.getWinEHFuncInfo()->UnwindHelpFrameIdx = UnwindHelpFI; // Store -2 into UnwindHelp on function entry. We have to scan forwards past // other frame setup instructions. MachineBasicBlock &MBB = MF.front(); auto MBBI = MBB.begin(); while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) ++MBBI; DebugLoc DL = MBB.findDebugLoc(MBBI); addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)), UnwindHelpFI) .addImm(-2); } Index: projects/clang380-import/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- projects/clang380-import/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 294609) @@ -1,28937 +1,28938 @@ //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the interfaces that X86 uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86CallingConv.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86ShuffleDecodeConstantPool.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include "X86IntrinsicsInfo.h" #include #include #include using namespace llvm; #define DEBUG_TYPE "x86-isel" STATISTIC(NumTailCalls, "Number of tail calls"); static cl::opt ExperimentalVectorWideningLegalization( "x86-experimental-vector-widening-legalization", cl::init(false), cl::desc("Enable an experimental vector type legalization through widening " "rather than promotion."), cl::Hidden); X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); // Set up the TargetLowering object. // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); // X86-SSE is even stranger. It uses -1 or 0 for vector masks. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // For 64-bit, since we have so many registers, use the ILP scheduler. // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling. if (Subtarget->isAtom()) setSchedulingPreference(Sched::ILP); else if (Subtarget->is64Bit()) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides on Atom when compiling with O2. if (TM.getOptLevel() >= CodeGenOpt::Default) { if (Subtarget->hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit()) addBypassSlowDiv(64, 16); } if (Subtarget->isTargetKnownWindowsMSVC()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); setLibcallName(RTLIB::SREM_I64, "_allrem"); setLibcallName(RTLIB::UREM_I64, "_aullrem"); setLibcallName(RTLIB::MUL_I64, "_allmul"); setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); } if (Subtarget->isTargetDarwin()) { // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(false); setUseUnderscoreLongJmp(false); } else if (Subtarget->isTargetWindowsGNU()) { // MS runtime is weird: it exports _setjmp, but longjmp! setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(false); } else { setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(true); } // Set up the register classes. addRegisterClass(MVT::i8, &X86::GR8RegClass); addRegisterClass(MVT::i16, &X86::GR16RegClass); addRegisterClass(MVT::i32, &X86::GR32RegClass); if (Subtarget->is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // We don't accept any truncstore of integer registers. setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); setTruncStoreAction(MVT::i64, MVT::i8 , Expand); setTruncStoreAction(MVT::i32, MVT::i16, Expand); setTruncStoreAction(MVT::i32, MVT::i8 , Expand); setTruncStoreAction(MVT::i16, MVT::i8, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); // SETOEQ and SETUNE require checking two conditions. setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); if (Subtarget->is64Bit()) { if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) // f32/f64 are legal, f80 is custom. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); else setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); } else if (!Subtarget->useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); } // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); if (!Subtarget->useSoftFloat()) { // SSE has no i16 to fp conversion, only i32 if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); } // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); if (!Subtarget->useSoftFloat()) { // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); if (X86ScalarSSEf32) { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); } // Handle FP_TO_UINT by promoting the destination to a larger signed // conversion. setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); if (Subtarget->is64Bit()) { if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } else { setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); } } else if (!Subtarget->useSoftFloat()) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) // Expand FP_TO_UINT into a select. // FIXME: We would like to use a Custom expander here eventually to do // the optimal thing for SSE vs. the default expansion in the legalizer. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); else // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. // With SSE3 we can use fisttpll to convert to a signed i64; without // SSE, we're stuck with a fistpll. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); if (Subtarget->is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } } else if (!Subtarget->is64Bit()) setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes // the two-result form to trivial CSE, which is able to combine x/y and x%y // into a single instruction. // // Scalar integer multiply-high is also lowered to use two-result // operations, to match the available instructions. However, plain multiply // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. setOperationAction(ISD::ADDC, VT, Custom); setOperationAction(ISD::ADDE, VT, Custom); setOperationAction(ISD::SUBC, VT, Custom); setOperationAction(ISD::SUBE, VT, Custom); } setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); setOperationAction(ISD::BR_CC , MVT::f32, Expand); setOperationAction(ISD::BR_CC , MVT::f64, Expand); setOperationAction(ISD::BR_CC , MVT::f80, Expand); setOperationAction(ISD::BR_CC , MVT::f128, Expand); setOperationAction(ISD::BR_CC , MVT::i8, Expand); setOperationAction(ISD::BR_CC , MVT::i16, Expand); setOperationAction(ISD::BR_CC , MVT::i32, Expand); setOperationAction(ISD::BR_CC , MVT::i64, Expand); setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); setOperationAction(ISD::SELECT_CC , MVT::f128, Expand); setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); setOperationAction(ISD::SELECT_CC , MVT::i64, Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) { // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` // is. We should promote the value to 64-bits to solve this. // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. setOperationAction(ISD::FREM , MVT::f32 , Promote); } else { setOperationAction(ISD::FREM , MVT::f32 , Expand); } setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. setOperationAction(ISD::CTTZ , MVT::i8 , Promote); AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32); setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote); AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32); if (Subtarget->hasBMI()) { setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); } else { setOperationAction(ISD::CTTZ , MVT::i16 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); if (Subtarget->is64Bit()) setOperationAction(ISD::CTTZ , MVT::i64 , Custom); } if (Subtarget->hasLZCNT()) { // When promoting the i8 variants, force them to i32 for a shorter // encoding. setOperationAction(ISD::CTLZ , MVT::i8 , Promote); AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote); AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); } else { setOperationAction(ISD::CTLZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i16 , Custom); setOperationAction(ISD::CTLZ , MVT::i32 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::CTLZ , MVT::i64 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); } } // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } // There's never any support for operations beyond MVT::f32. setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); if (Subtarget->hasPOPCNT()) { setOperationAction(ISD::CTPOP , MVT::i8 , Promote); } else { setOperationAction(ISD::CTPOP , MVT::i8 , Expand); setOperationAction(ISD::CTPOP , MVT::i16 , Expand); setOperationAction(ISD::CTPOP , MVT::i32 , Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::CTPOP , MVT::i64 , Expand); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); if (!Subtarget->hasMOVBE()) setOperationAction(ISD::BSWAP , MVT::i16 , Expand); // These should be promoted to a larger select which is supported. setOperationAction(ISD::SELECT , MVT::i1 , Promote); // X86 wants to expand cmov itself. setOperationAction(ISD::SELECT , MVT::i8 , Custom); setOperationAction(ISD::SELECT , MVT::i16 , Custom); setOperationAction(ISD::SELECT , MVT::i32 , Custom); setOperationAction(ISD::SELECT , MVT::f32 , Custom); setOperationAction(ISD::SELECT , MVT::f64 , Custom); setOperationAction(ISD::SELECT , MVT::f80 , Custom); setOperationAction(ISD::SELECT , MVT::f128 , Custom); setOperationAction(ISD::SETCC , MVT::i8 , Custom); setOperationAction(ISD::SETCC , MVT::i16 , Custom); setOperationAction(ISD::SETCC , MVT::i32 , Custom); setOperationAction(ISD::SETCC , MVT::f32 , Custom); setOperationAction(ISD::SETCC , MVT::f64 , Custom); setOperationAction(ISD::SETCC , MVT::f80 , Custom); setOperationAction(ISD::SETCC , MVT::f128 , Custom); setOperationAction(ISD::SETCCE , MVT::i8 , Custom); setOperationAction(ISD::SETCCE , MVT::i16 , Custom); setOperationAction(ISD::SETCCE , MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::SELECT , MVT::i64 , Custom); setOperationAction(ISD::SETCC , MVT::i64 , Custom); setOperationAction(ISD::SETCCE , MVT::i64 , Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to // support continuation, user-level threading, and etc.. As a result, no // other SjLj exception interfaces are implemented and please don't build // your own exception handling based on them. // LLVM/Clang supports zero-cost DWARF exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); // Darwin ABI issue. setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); setOperationAction(ISD::JumpTable , MVT::i32 , Custom); setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); if (Subtarget->is64Bit()) setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); setOperationAction(ISD::JumpTable , MVT::i64 , Custom); setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); } // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); } if (Subtarget->hasSSE1()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } if (Subtarget->hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() && !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); if (Subtarget->is64Bit()) { setOperationAction(ISD::VAARG , MVT::Other, Custom); setOperationAction(ISD::VACOPY , MVT::Other, Custom); } else { // TargetInfo::CharPtrBuiltinVaList setOperationAction(ISD::VAARG , MVT::Other, Expand); setOperationAction(ISD::VACOPY , MVT::Other, Expand); } setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); if (!Subtarget->useSoftFloat() && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); addRegisterClass(MVT::f64, &X86::FR64RegClass); // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS , MVT::f64, Custom); setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f64, Custom); setOperationAction(ISD::FNEG , MVT::f32, Custom); // Use ANDPD and ORPD to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // Lower this to FGETSIGNx86 plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Expand FP immediates into loads from the stack, except for the special // cases we handle. addLegalFPImmediate(APFloat(+0.0)); // xorpd addLegalFPImmediate(APFloat(+0.0f)); // xorps } else if (!Subtarget->useSoftFloat() && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f32, Custom); setOperationAction(ISD::UNDEF, MVT::f64, Expand); // Use ANDPS and ORPS to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Special cases we handle for FP constants. addLegalFPImmediate(APFloat(+0.0f)); // xorps addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } } else if (!Subtarget->useSoftFloat()) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, &X86::RFP64RegClass); addRegisterClass(MVT::f32, &X86::RFP32RegClass); setOperationAction(ISD::UNDEF, MVT::f64, Expand); setOperationAction(ISD::UNDEF, MVT::f32, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS addLegalFPImmediate(APFloat(+0.0f)); // FLD0 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS } // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87, except f128 in MMX. if (!Subtarget->useSoftFloat()) { if (Subtarget->is64Bit() && Subtarget->hasMMX()) { addRegisterClass(MVT::f128, &X86::FR128RegClass); ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); setOperationAction(ISD::FABS , MVT::f128, Custom); setOperationAction(ISD::FNEG , MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); } addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); addLegalFPImmediate(TmpFlt); // FLD0/FCHS bool ignored; APFloat TmpFlt2(+1.0); TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, &ignored); addLegalFPImmediate(TmpFlt2); // FLD1 TmpFlt2.changeSign(); addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f80, Expand); setOperationAction(ISD::FCOS , MVT::f80, Expand); setOperationAction(ISD::FSINCOS, MVT::f80, Expand); } setOperationAction(ISD::FFLOOR, MVT::f80, Expand); setOperationAction(ISD::FCEIL, MVT::f80, Expand); setOperationAction(ISD::FTRUNC, MVT::f80, Expand); setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); } // Always use a library call for pow. setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); setOperationAction(ISD::FLOG10, MVT::f80, Expand); setOperationAction(ISD::FEXP, MVT::f80, Expand); setOperationAction(ISD::FEXP2, MVT::f80, Expand); setOperationAction(ISD::FMINNUM, MVT::f80, Expand); setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::ADD , VT, Expand); setOperationAction(ISD::SUB , VT, Expand); setOperationAction(ISD::FADD, VT, Expand); setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::FSUB, VT, Expand); setOperationAction(ISD::MUL , VT, Expand); setOperationAction(ISD::FMUL, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::LOAD, VT, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FPOWI, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::SHL, VT, Expand); setOperationAction(ISD::SRA, VT, Expand); setOperationAction(ISD::SRL, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); setOperationAction(ISD::TRUNCATE, VT, Expand); setOperationAction(ISD::SIGN_EXTEND, VT, Expand); setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(InnerVT, VT, Expand); setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like // types, we have to deal with them whether we ask for Expansion or not. // Setting Expand causes its own optimisation problems though, so leave // them legal. if (VT.getVectorElementType() == MVT::i1) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are // split/scalarized right now. if (VT.getVectorElementType() == MVT::f16) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); } } // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!Subtarget->useSoftFloat() && Subtarget->hasMMX()) { addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. } // MMX-sized vectors (other than x86mmx) are expected to be expanded // into smaller operations. for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) { setOperationAction(ISD::MULHS, MMXTy, Expand); setOperationAction(ISD::AND, MMXTy, Expand); setOperationAction(ISD::OR, MMXTy, Expand); setOperationAction(ISD::XOR, MMXTy, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand); setOperationAction(ISD::SELECT, MMXTy, Expand); setOperationAction(ISD::BITCAST, MMXTy, Expand); } setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); if (!Subtarget->useSoftFloat() && Subtarget->hasSSE1()) { addRegisterClass(MVT::v4f32, &X86::VR128RegClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); setOperationAction(ISD::FSUB, MVT::v4f32, Legal); setOperationAction(ISD::FMUL, MVT::v4f32, Legal); setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v4f32, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); } if (!Subtarget->useSoftFloat() && Subtarget->hasSSE2()) { addRegisterClass(MVT::v2f64, &X86::VR128RegClass); // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM // registers cannot be used even for integer operations. addRegisterClass(MVT::v16i8, &X86::VR128RegClass); addRegisterClass(MVT::v8i16, &X86::VR128RegClass); addRegisterClass(MVT::v4i32, &X86::VR128RegClass); addRegisterClass(MVT::v2i64, &X86::VR128RegClass); setOperationAction(ISD::ADD, MVT::v16i8, Legal); setOperationAction(ISD::ADD, MVT::v8i16, Legal); setOperationAction(ISD::ADD, MVT::v4i32, Legal); setOperationAction(ISD::ADD, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v16i8, Legal); setOperationAction(ISD::SUB, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v4i32, Legal); setOperationAction(ISD::SUB, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); setOperationAction(ISD::FADD, MVT::v2f64, Legal); setOperationAction(ISD::FSUB, MVT::v2f64, Legal); setOperationAction(ISD::FMUL, MVT::v2f64, Legal); setOperationAction(ISD::FDIV, MVT::v2f64, Legal); setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::SMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v16i8, Legal); setOperationAction(ISD::SMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v16i8, Legal); setOperationAction(ISD::SETCC, MVT::v2i64, Custom); setOperationAction(ISD::SETCC, MVT::v16i8, Custom); setOperationAction(ISD::SETCC, MVT::v8i16, Custom); setOperationAction(ISD::SETCC, MVT::v4i32, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::CTPOP, MVT::v16i8, Custom); setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); // ISD::CTTZ v2i64 - scalarization is faster. setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster. // Custom lower build_vector, vector_shuffle, and extract_vector_elt. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } // We support custom legalizing of sext and anyext loads for specific // memory vector types which we can load as a scalar (or sequence of // scalars) and extend in-register to a legal 128-bit vector type. For sext // loads these must work with a single scalar load. for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); } setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); } // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v2i64); setOperationAction(ISD::OR, VT, Promote); AddPromotedToType (ISD::OR, VT, MVT::v2i64); setOperationAction(ISD::XOR, VT, Promote); AddPromotedToType (ISD::XOR, VT, MVT::v2i64); setOperationAction(ISD::LOAD, VT, Promote); AddPromotedToType (ISD::LOAD, VT, MVT::v2i64); setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v2i64); } // Custom lower v2i64 and v2f64 selects. setOperationAction(ISD::LOAD, MVT::v2f64, Legal); setOperationAction(ISD::LOAD, MVT::v2i64, Legal); setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); // As there is no 64-bit GPR available, we need build a special custom // sequence to convert from v2i32 to v2f32. if (!Subtarget->is64Bit()) setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); } if (!Subtarget->useSoftFloat() && Subtarget->hasSSE41()) { for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, RoundedTy, Legal); setOperationAction(ISD::FCEIL, RoundedTy, Legal); setOperationAction(ISD::FTRUNC, RoundedTy, Legal); setOperationAction(ISD::FRINT, RoundedTy, Legal); setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); } setOperationAction(ISD::SMAX, MVT::v16i8, Legal); setOperationAction(ISD::SMAX, MVT::v4i32, Legal); setOperationAction(ISD::UMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v4i32, Legal); setOperationAction(ISD::SMIN, MVT::v16i8, Legal); setOperationAction(ISD::SMIN, MVT::v4i32, Legal); setOperationAction(ISD::UMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v4i32, Legal); // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); // We directly match byte blends in the backend as they match the VSELECT // condition form. setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); } // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); // i8 and i16 vectors are custom because the source register and source // source memory operand types are not the same width. f32 vectors are // custom since the immediate controlling the insert encodes additional // information. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); // FIXME: these should be Legal, but that's only for the case where // the index is constant. For now custom expand to deal with that. if (Subtarget->is64Bit()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); } } if (Subtarget->hasSSE2()) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); setOperationAction(ISD::SRL, MVT::v8i16, Custom); setOperationAction(ISD::SRL, MVT::v16i8, Custom); setOperationAction(ISD::SHL, MVT::v8i16, Custom); setOperationAction(ISD::SHL, MVT::v16i8, Custom); setOperationAction(ISD::SRA, MVT::v8i16, Custom); setOperationAction(ISD::SRA, MVT::v16i8, Custom); // In the customized shift lowering, the legal cases in AVX2 will be // recognized. setOperationAction(ISD::SRL, MVT::v2i64, Custom); setOperationAction(ISD::SRL, MVT::v4i32, Custom); setOperationAction(ISD::SHL, MVT::v2i64, Custom); setOperationAction(ISD::SHL, MVT::v4i32, Custom); setOperationAction(ISD::SRA, MVT::v2i64, Custom); setOperationAction(ISD::SRA, MVT::v4i32, Custom); } if (Subtarget->hasXOP()) { setOperationAction(ISD::ROTL, MVT::v16i8, Custom); setOperationAction(ISD::ROTL, MVT::v8i16, Custom); setOperationAction(ISD::ROTL, MVT::v4i32, Custom); setOperationAction(ISD::ROTL, MVT::v2i64, Custom); setOperationAction(ISD::ROTL, MVT::v32i8, Custom); setOperationAction(ISD::ROTL, MVT::v16i16, Custom); setOperationAction(ISD::ROTL, MVT::v8i32, Custom); setOperationAction(ISD::ROTL, MVT::v4i64, Custom); } if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) { addRegisterClass(MVT::v32i8, &X86::VR256RegClass); addRegisterClass(MVT::v16i16, &X86::VR256RegClass); addRegisterClass(MVT::v8i32, &X86::VR256RegClass); addRegisterClass(MVT::v8f32, &X86::VR256RegClass); addRegisterClass(MVT::v4i64, &X86::VR256RegClass); addRegisterClass(MVT::v4f64, &X86::VR256RegClass); setOperationAction(ISD::LOAD, MVT::v8f32, Legal); setOperationAction(ISD::LOAD, MVT::v4f64, Legal); setOperationAction(ISD::LOAD, MVT::v4i64, Legal); setOperationAction(ISD::FADD, MVT::v8f32, Legal); setOperationAction(ISD::FSUB, MVT::v8f32, Legal); setOperationAction(ISD::FMUL, MVT::v8f32, Legal); setOperationAction(ISD::FDIV, MVT::v8f32, Legal); setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal); setOperationAction(ISD::FCEIL, MVT::v8f32, Legal); setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal); setOperationAction(ISD::FRINT, MVT::v8f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal); setOperationAction(ISD::FNEG, MVT::v8f32, Custom); setOperationAction(ISD::FABS, MVT::v8f32, Custom); setOperationAction(ISD::FADD, MVT::v4f64, Legal); setOperationAction(ISD::FSUB, MVT::v4f64, Legal); setOperationAction(ISD::FMUL, MVT::v4f64, Legal); setOperationAction(ISD::FDIV, MVT::v4f64, Legal); setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); setOperationAction(ISD::FRINT, MVT::v4f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal); setOperationAction(ISD::FNEG, MVT::v4f64, Custom); setOperationAction(ISD::FABS, MVT::v4f64, Custom); // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); setOperationAction(ISD::SRL, MVT::v16i16, Custom); setOperationAction(ISD::SRL, MVT::v32i8, Custom); setOperationAction(ISD::SHL, MVT::v16i16, Custom); setOperationAction(ISD::SHL, MVT::v32i8, Custom); setOperationAction(ISD::SRA, MVT::v16i16, Custom); setOperationAction(ISD::SRA, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v16i16, Custom); setOperationAction(ISD::SETCC, MVT::v8i32, Custom); setOperationAction(ISD::SETCC, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v32i8, Custom); setOperationAction(ISD::CTPOP, MVT::v16i16, Custom); setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); setOperationAction(ISD::CTTZ, MVT::v32i8, Custom); setOperationAction(ISD::CTTZ, MVT::v16i16, Custom); setOperationAction(ISD::CTTZ, MVT::v8i32, Custom); setOperationAction(ISD::CTTZ, MVT::v4i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v32i8, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i16, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); if (Subtarget->hasAnyFMA()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); setOperationAction(ISD::FMA, MVT::v2f64, Legal); setOperationAction(ISD::FMA, MVT::f32, Legal); setOperationAction(ISD::FMA, MVT::f64, Legal); } if (Subtarget->hasInt256()) { setOperationAction(ISD::ADD, MVT::v4i64, Legal); setOperationAction(ISD::ADD, MVT::v8i32, Legal); setOperationAction(ISD::ADD, MVT::v16i16, Legal); setOperationAction(ISD::ADD, MVT::v32i8, Legal); setOperationAction(ISD::SUB, MVT::v4i64, Legal); setOperationAction(ISD::SUB, MVT::v8i32, Legal); setOperationAction(ISD::SUB, MVT::v16i16, Legal); setOperationAction(ISD::SUB, MVT::v32i8, Legal); setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, Legal); setOperationAction(ISD::MUL, MVT::v16i16, Legal); setOperationAction(ISD::MUL, MVT::v32i8, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i16, Legal); setOperationAction(ISD::MULHS, MVT::v16i16, Legal); setOperationAction(ISD::SMAX, MVT::v32i8, Legal); setOperationAction(ISD::SMAX, MVT::v16i16, Legal); setOperationAction(ISD::SMAX, MVT::v8i32, Legal); setOperationAction(ISD::UMAX, MVT::v32i8, Legal); setOperationAction(ISD::UMAX, MVT::v16i16, Legal); setOperationAction(ISD::UMAX, MVT::v8i32, Legal); setOperationAction(ISD::SMIN, MVT::v32i8, Legal); setOperationAction(ISD::SMIN, MVT::v16i16, Legal); setOperationAction(ISD::SMIN, MVT::v8i32, Legal); setOperationAction(ISD::UMIN, MVT::v32i8, Legal); setOperationAction(ISD::UMIN, MVT::v16i16, Legal); setOperationAction(ISD::UMIN, MVT::v8i32, Legal); // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); setOperationAction(ISD::ADD, MVT::v16i16, Custom); setOperationAction(ISD::ADD, MVT::v32i8, Custom); setOperationAction(ISD::SUB, MVT::v4i64, Custom); setOperationAction(ISD::SUB, MVT::v8i32, Custom); setOperationAction(ISD::SUB, MVT::v16i16, Custom); setOperationAction(ISD::SUB, MVT::v32i8, Custom); setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, Custom); setOperationAction(ISD::MUL, MVT::v16i16, Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); setOperationAction(ISD::SMAX, MVT::v32i8, Custom); setOperationAction(ISD::SMAX, MVT::v16i16, Custom); setOperationAction(ISD::SMAX, MVT::v8i32, Custom); setOperationAction(ISD::UMAX, MVT::v32i8, Custom); setOperationAction(ISD::UMAX, MVT::v16i16, Custom); setOperationAction(ISD::UMAX, MVT::v8i32, Custom); setOperationAction(ISD::SMIN, MVT::v32i8, Custom); setOperationAction(ISD::SMIN, MVT::v16i16, Custom); setOperationAction(ISD::SMIN, MVT::v8i32, Custom); setOperationAction(ISD::UMIN, MVT::v32i8, Custom); setOperationAction(ISD::UMIN, MVT::v16i16, Custom); setOperationAction(ISD::UMIN, MVT::v8i32, Custom); } // In the customized shift lowering, the legal cases in AVX2 will be // recognized. setOperationAction(ISD::SRL, MVT::v4i64, Custom); setOperationAction(ISD::SRL, MVT::v8i32, Custom); setOperationAction(ISD::SHL, MVT::v4i64, Custom); setOperationAction(ISD::SHL, MVT::v8i32, Custom); setOperationAction(ISD::SRA, MVT::v4i64, Custom); setOperationAction(ISD::SRA, MVT::v8i32, Custom); // Custom lower several nodes for 256-bit types. for (MVT VT : MVT::vector_valuetypes()) { if (VT.getScalarSizeInBits() >= 32) { setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); } // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. if (VT.is128BitVector()) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } // Do not attempt to custom lower other non-256-bit vectors if (!VT.is256BitVector()) continue; setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } if (Subtarget->hasInt256()) setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v4i64); setOperationAction(ISD::OR, VT, Promote); AddPromotedToType (ISD::OR, VT, MVT::v4i64); setOperationAction(ISD::XOR, VT, Promote); AddPromotedToType (ISD::XOR, VT, MVT::v4i64); setOperationAction(ISD::LOAD, VT, Promote); AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); } } if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); addRegisterClass(MVT::i1, &X86::VK1RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); setOperationAction(ISD::AND, MVT::i1, Legal); setOperationAction(ISD::SUB, MVT::i1, Custom); setOperationAction(ISD::ADD, MVT::i1, Custom); setOperationAction(ISD::MUL, MVT::i1, Custom); setOperationAction(ISD::LOAD, MVT::v16f32, Legal); setOperationAction(ISD::LOAD, MVT::v8f64, Legal); setOperationAction(ISD::LOAD, MVT::v8i64, Legal); setOperationAction(ISD::LOAD, MVT::v16i32, Legal); setOperationAction(ISD::LOAD, MVT::v16i1, Legal); setOperationAction(ISD::FADD, MVT::v16f32, Legal); setOperationAction(ISD::FSUB, MVT::v16f32, Legal); setOperationAction(ISD::FMUL, MVT::v16f32, Legal); setOperationAction(ISD::FDIV, MVT::v16f32, Legal); setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::FNEG, MVT::v16f32, Custom); setOperationAction(ISD::FABS, MVT::v16f32, Custom); setOperationAction(ISD::FADD, MVT::v8f64, Legal); setOperationAction(ISD::FSUB, MVT::v8f64, Legal); setOperationAction(ISD::FMUL, MVT::v8f64, Legal); setOperationAction(ISD::FDIV, MVT::v8f64, Legal); setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); setOperationAction(ISD::FNEG, MVT::v8f64, Custom); setOperationAction(ISD::FABS, MVT::v8f64, Custom); setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); if (Subtarget->hasVLX()){ setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); } else { setOperationAction(ISD::MLOAD, MVT::v8i32, Custom); setOperationAction(ISD::MLOAD, MVT::v8f32, Custom); setOperationAction(ISD::MSTORE, MVT::v8i32, Custom); setOperationAction(ISD::MSTORE, MVT::v8f32, Custom); } setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom); if (Subtarget->hasDQI()) { setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); if (Subtarget->hasVLX()) { setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); } } if (Subtarget->hasVLX()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); } setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); if (Subtarget->hasDQI()) { setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); } setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal); setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal); setOperationAction(ISD::FCEIL, MVT::v16f32, Legal); setOperationAction(ISD::FCEIL, MVT::v8f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal); setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal); setOperationAction(ISD::FRINT, MVT::v16f32, Legal); setOperationAction(ISD::FRINT, MVT::v8f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); setOperationAction(ISD::SETCC, MVT::v16i1, Custom); setOperationAction(ISD::SETCC, MVT::v8i1, Custom); setOperationAction(ISD::MUL, MVT::v8i64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); setOperationAction(ISD::SELECT, MVT::v8i64, Custom); setOperationAction(ISD::SELECT, MVT::v16f32, Custom); setOperationAction(ISD::SELECT, MVT::v16i1, Custom); setOperationAction(ISD::SELECT, MVT::v8i1, Custom); setOperationAction(ISD::SMAX, MVT::v16i32, Legal); setOperationAction(ISD::SMAX, MVT::v8i64, Legal); setOperationAction(ISD::UMAX, MVT::v16i32, Legal); setOperationAction(ISD::UMAX, MVT::v8i64, Legal); setOperationAction(ISD::SMIN, MVT::v16i32, Legal); setOperationAction(ISD::SMIN, MVT::v8i64, Legal); setOperationAction(ISD::UMIN, MVT::v16i32, Legal); setOperationAction(ISD::UMIN, MVT::v8i64, Legal); setOperationAction(ISD::ADD, MVT::v8i64, Legal); setOperationAction(ISD::ADD, MVT::v16i32, Legal); setOperationAction(ISD::SUB, MVT::v8i64, Legal); setOperationAction(ISD::SUB, MVT::v16i32, Legal); setOperationAction(ISD::MUL, MVT::v16i32, Legal); setOperationAction(ISD::SRL, MVT::v8i64, Custom); setOperationAction(ISD::SRL, MVT::v16i32, Custom); setOperationAction(ISD::SHL, MVT::v8i64, Custom); setOperationAction(ISD::SHL, MVT::v16i32, Custom); setOperationAction(ISD::SRA, MVT::v8i64, Custom); setOperationAction(ISD::SRA, MVT::v16i32, Custom); setOperationAction(ISD::AND, MVT::v8i64, Legal); setOperationAction(ISD::OR, MVT::v8i64, Legal); setOperationAction(ISD::XOR, MVT::v8i64, Legal); setOperationAction(ISD::AND, MVT::v16i32, Legal); setOperationAction(ISD::OR, MVT::v16i32, Legal); setOperationAction(ISD::XOR, MVT::v16i32, Legal); if (Subtarget->hasCDI()) { setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Expand); setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v16i16, Custom); setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom); if (Subtarget->hasVLX()) { setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); } else { setOperationAction(ISD::CTLZ, MVT::v4i64, Custom); setOperationAction(ISD::CTLZ, MVT::v8i32, Custom); setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); } } // Subtarget->hasCDI() if (Subtarget->hasDQI()) { setOperationAction(ISD::MUL, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v4i64, Legal); setOperationAction(ISD::MUL, MVT::v8i64, Legal); } // Custom lower several nodes. for (MVT VT : MVT::vector_valuetypes()) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); if (EltSize == 1) { setOperationAction(ISD::AND, VT, Legal); setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); } if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) { setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } // Extract subvector is special because the value type // (result) is 256/128-bit but the source is 512-bit wide. if (VT.is128BitVector() || VT.is256BitVector()) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } if (VT.getVectorElementType() == MVT::i1) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); // Do not attempt to custom lower other non-512-bit vectors if (!VT.is512BitVector()) continue; if (EltSize >= 32) { setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Legal); setOperationAction(ISD::MSCATTER, VT, Custom); } } for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); } }// has AVX-512 if (!Subtarget->useSoftFloat() && Subtarget->hasBWI()) { addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); setOperationAction(ISD::LOAD, MVT::v32i16, Legal); setOperationAction(ISD::LOAD, MVT::v64i8, Legal); setOperationAction(ISD::SETCC, MVT::v32i1, Custom); setOperationAction(ISD::SETCC, MVT::v64i1, Custom); setOperationAction(ISD::ADD, MVT::v32i16, Legal); setOperationAction(ISD::ADD, MVT::v64i8, Legal); setOperationAction(ISD::SUB, MVT::v32i16, Legal); setOperationAction(ISD::SUB, MVT::v64i8, Legal); setOperationAction(ISD::MUL, MVT::v32i16, Legal); setOperationAction(ISD::MULHS, MVT::v32i16, Legal); setOperationAction(ISD::MULHU, MVT::v32i16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::SELECT, MVT::v32i1, Custom); setOperationAction(ISD::SELECT, MVT::v64i1, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); setOperationAction(ISD::SMAX, MVT::v64i8, Legal); setOperationAction(ISD::SMAX, MVT::v32i16, Legal); setOperationAction(ISD::UMAX, MVT::v64i8, Legal); setOperationAction(ISD::UMAX, MVT::v32i16, Legal); setOperationAction(ISD::SMIN, MVT::v64i8, Legal); setOperationAction(ISD::SMIN, MVT::v32i16, Legal); setOperationAction(ISD::UMIN, MVT::v64i8, Legal); setOperationAction(ISD::UMIN, MVT::v32i16, Legal); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); if (Subtarget->hasVLX()) setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); if (Subtarget->hasCDI()) { setOperationAction(ISD::CTLZ, MVT::v32i16, Custom); setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Expand); } for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v8i64); setOperationAction(ISD::OR, VT, Promote); AddPromotedToType (ISD::OR, VT, MVT::v8i64); setOperationAction(ISD::XOR, VT, Promote); AddPromotedToType (ISD::XOR, VT, MVT::v8i64); } } if (!Subtarget->useSoftFloat() && Subtarget->hasVLX()) { addRegisterClass(MVT::v4i1, &X86::VK4RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); setOperationAction(ISD::SETCC, MVT::v4i1, Custom); setOperationAction(ISD::SETCC, MVT::v2i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); setOperationAction(ISD::SELECT, MVT::v4i1, Custom); setOperationAction(ISD::SELECT, MVT::v2i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom); setOperationAction(ISD::AND, MVT::v8i32, Legal); setOperationAction(ISD::OR, MVT::v8i32, Legal); setOperationAction(ISD::XOR, MVT::v8i32, Legal); setOperationAction(ISD::AND, MVT::v4i32, Legal); setOperationAction(ISD::OR, MVT::v4i32, Legal); setOperationAction(ISD::XOR, MVT::v4i32, Legal); setOperationAction(ISD::SRA, MVT::v2i64, Custom); setOperationAction(ISD::SRA, MVT::v4i64, Custom); setOperationAction(ISD::SMAX, MVT::v2i64, Legal); setOperationAction(ISD::SMAX, MVT::v4i64, Legal); setOperationAction(ISD::UMAX, MVT::v2i64, Legal); setOperationAction(ISD::UMAX, MVT::v4i64, Legal); setOperationAction(ISD::SMIN, MVT::v2i64, Legal); setOperationAction(ISD::SMIN, MVT::v4i64, Legal); setOperationAction(ISD::UMIN, MVT::v2i64, Legal); setOperationAction(ISD::UMIN, MVT::v4i64, Legal); } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); if (!Subtarget->is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. // // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget->is64Bit()) continue; // Add/Sub/Mul with overflow operations are custom lowered. setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); setOperationAction(ISD::USUBO, VT, Custom); setOperationAction(ISD::SMULO, VT, Custom); setOperationAction(ISD::UMULO, VT, Custom); } if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); } // Combine sin / cos into one node or libcall if possible. if (Subtarget->hasSinCos()) { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); if (Subtarget->isTargetDarwin()) { // For MacOSX, we don't want the normal expansion of a libcall to sincos. // We want to issue a libcall to __sincos_stret to avoid memory traffic. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } } if (Subtarget->isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); setOperationAction(ISD::UDIV, MVT::i128, Custom); setOperationAction(ISD::SREM, MVT::i128, Custom); setOperationAction(ISD::UREM, MVT::i128, Custom); setOperationAction(ISD::SDIVREM, MVT::i128, Custom); setOperationAction(ISD::UDIVREM, MVT::i128, Custom); } // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MLOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::MSTORE); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::MSCATTER); setTargetDAGCombine(ISD::MGATHER); computeRegisterProperties(Subtarget->getRegisterInfo()); MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; setPrefLoopAlignment(4); // 2^4 bytes. // A predictable cmov does not hurt on an in-order CPU. // FIXME: Use a CPU attribute to trigger this, not a CPU model. PredictableSelectIsExpensive = !Subtarget->isAtom(); EnableExtLdPromotion = true; setPrefFunctionAlignment(4); // 2^4 bytes. verifyIntrinsicTables(); } // This has so far only been implemented for 64-bit MachO. bool X86TargetLowering::useLoadStackGuardNode() const { return Subtarget->isTargetMachO() && Subtarget->is64Bit(); } TargetLoweringBase::LegalizeTypeAction X86TargetLowering::getPreferredVectorAction(EVT VT) const { if (ExperimentalVectorWideningLegalization && VT.getVectorNumElements() != 1 && VT.getVectorElementType().getSimpleVT() != MVT::i1) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const { if (!VT.isVector()) return Subtarget->hasAVX512() ? MVT::i1: MVT::i8; if (VT.isSimple()) { MVT VVT = VT.getSimpleVT(); const unsigned NumElts = VVT.getVectorNumElements(); const MVT EltVT = VVT.getVectorElementType(); if (VVT.is512BitVector()) { if (Subtarget->hasAVX512()) if (EltVT == MVT::i32 || EltVT == MVT::i64 || EltVT == MVT::f32 || EltVT == MVT::f64) switch(NumElts) { case 8: return MVT::v8i1; case 16: return MVT::v16i1; } if (Subtarget->hasBWI()) if (EltVT == MVT::i8 || EltVT == MVT::i16) switch(NumElts) { case 32: return MVT::v32i1; case 64: return MVT::v64i1; } } if (VVT.is256BitVector() || VVT.is128BitVector()) { if (Subtarget->hasVLX()) if (EltVT == MVT::i32 || EltVT == MVT::i64 || EltVT == MVT::f32 || EltVT == MVT::f64) switch(NumElts) { case 2: return MVT::v2i1; case 4: return MVT::v4i1; case 8: return MVT::v8i1; } if (Subtarget->hasBWI() && Subtarget->hasVLX()) if (EltVT == MVT::i8 || EltVT == MVT::i16) switch(NumElts) { case 8: return MVT::v8i1; case 16: return MVT::v16i1; case 32: return MVT::v32i1; } } } return VT.changeVectorElementTypeToInteger(); } /// Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (MaxAlign == 16) return; if (VectorType *VTy = dyn_cast(Ty)) { if (VTy->getBitWidth() == 128) MaxAlign = 16; } else if (ArrayType *ATy = dyn_cast(Ty)) { unsigned EltAlign = 0; getMaxByValAlign(ATy->getElementType(), EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast(Ty)) { for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; getMaxByValAlign(EltTy, EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == 16) break; } } } /// Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, const DataLayout &DL) const { if (Subtarget->is64Bit()) { // Max of 8 and alignment of type. unsigned TyAlign = DL.getABITypeAlignment(Ty); if (TyAlign > 8) return TyAlign; return 8; } unsigned Align = 4; if (Subtarget->hasSSE1()) getMaxByValAlign(Ty, Align); return Align; } /// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If 'IsMemset' is /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. EVT X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { const Function *F = MF.getFunction(); if ((!IsMemset || ZeroMemset) && !F->hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && (!Subtarget->isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { if (Size >= 32) { // FIXME: Check if unaligned 32-byte accesses are slow. if (Subtarget->hasInt256()) return MVT::v8i32; if (Subtarget->hasFp256()) return MVT::v8f32; } if (Subtarget->hasSSE2()) return MVT::v4i32; if (Subtarget->hasSSE1()) return MVT::v4f32; } else if (!MemcpyStrSrc && Size >= 8 && !Subtarget->is64Bit() && Subtarget->hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. return MVT::f64; } } // This is a compromise. If we reach here, unaligned accesses may be slow on // this target. However, creating smaller, aligned accesses could be even // slower and would certainly be a lot more code. if (Subtarget->is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; } bool X86TargetLowering::isSafeMemOpType(MVT VT) const { if (VT == MVT::f32) return X86ScalarSSEf32; else if (VT == MVT::f64) return X86ScalarSSEf64; return true; } bool X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { if (Fast) { switch (VT.getSizeInBits()) { default: // 8-byte and under are always assumed to be fast. *Fast = true; break; case 128: *Fast = !Subtarget->isUnalignedMem16Slow(); break; case 256: *Fast = !Subtarget->isUnalignedMem32Slow(); break; // TODO: What about AVX-512 (512-bit) accesses? } } // Misaligned accesses of any size are always allowed. return true; } /// Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. unsigned X86TargetLowering::getJumpTableEncoding() const { // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF // symbol. if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && Subtarget->isPICStyleGOT()) return MachineJumpTableInfo::EK_Custom32; // Otherwise, use the normal jump table encoding heuristics. return TargetLowering::getJumpTableEncoding(); } bool X86TargetLowering::useSoftFloat() const { return Subtarget->useSoftFloat(); } const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid,MCContext &Ctx) const{ assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ && Subtarget->isPICStyleGOT()); // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF // entries. return MCSymbolRefExpr::create(MBB->getSymbol(), MCSymbolRefExpr::VK_GOTOFF, Ctx); } /// Returns relocation base for the given PIC jumptable. SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget->is64Bit()) // This doesn't have SDLoc associated with it, but is not really the // same as a Register. return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())); return Table; } /// This returns the relocation base for the given PIC jumptable, /// the same as getPICJumpTableRelocBase, but as an MCExpr. const MCExpr *X86TargetLowering:: getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { // X86-64 uses RIP relative addressing based on the jump table label. if (Subtarget->isPICStyleRIPRel()) return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); // Otherwise, the reference is relative to the PIC base. return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); } std::pair X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const { const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: return TargetLowering::findRepresentativeClass(TRI, VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; break; case MVT::x86mmx: RRC = &X86::VR64RegClass; break; case MVT::f32: case MVT::f64: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: RRC = &X86::VR128RegClass; break; } return std::make_pair(RRC, Cost); } bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const { if (!Subtarget->isTargetLinux()) return false; if (Subtarget->is64Bit()) { // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: Offset = 0x28; if (getTargetMachine().getCodeModel() == CodeModel::Kernel) AddressSpace = 256; else AddressSpace = 257; } else { // %gs:0x14 on i386 Offset = 0x14; AddressSpace = 256; } return true; } Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (!Subtarget->isTargetAndroid()) return TargetLowering::getSafeStackPointerLocation(IRB); // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h unsigned AddressSpace, Offset; if (Subtarget->is64Bit()) { // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: Offset = 0x48; if (getTargetMachine().getCodeModel() == CodeModel::Kernel) AddressSpace = 256; else AddressSpace = 257; } else { // %gs:0x24 on i386 Offset = 0x24; AddressSpace = 256; } return ConstantExpr::getIntToPtr( ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); } bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); return SrcAS < 256 && DestAS < 256; } //===----------------------------------------------------------------------===// // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// #include "X86GenCallingConv.inc" bool X86TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; return ScratchRegs; } SDValue X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, SDLoc dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); if (CallConv == CallingConv::X86_INTR && !Outs.empty()) report_fatal_error("X86 interrupts may not return any value"); SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); SDValue Flag; SmallVector RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Operand #1 = Bytes To Pop RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, MVT::i16)); // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue ValToCopy = OutVals[i]; EVT ValVT = ValToCopy.getValueType(); // Promote values to the appropriate types. if (VA.getLocInfo() == CCValAssign::SExt) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::ZExt) ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::AExt) { if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); } else if (VA.getLocInfo() == CCValAssign::BCvt) ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); assert(VA.getLocInfo() != CCValAssign::FPExt && "Unexpected FP-extend for return value."); // If this is x86-64, and we disabled SSE, we can't return FP values, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } // Likewise we can't return F64 values with SSE1 only. gcc does so, but // llvm-gcc has never done it right and no one has noticed, so this // should be OK for now. if (ValVT == MVT::f64 && (Subtarget->is64Bit() && !Subtarget->hasSSE2())) report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) { // If this is a copy from an xmm register to ST(0), use an FPExtend to // change the value to the FP stack register class. if (isScalarFPTypeInSSEReg(VA.getValVT())) ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); RetOps.push_back(ValToCopy); // Don't emit a copytoreg. continue; } // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 // which is returned in RAX / RDX. if (Subtarget->is64Bit()) { if (ValVT == MVT::x86mmx) { if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. if (!Subtarget->hasSSE2()) ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); } } } Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } // All x86 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. // // Checking Function.hasStructRetAttr() here is insufficient because the IR // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is // false, then an sret argument may be implicitly inserted in the SelDAG. In // either case FuncInfo->setSRetReturnReg() will have been called. if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy(MF.getDataLayout())); unsigned RetValReg = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? X86::RAX : X86::EAX; Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); Flag = Chain.getValue(1); // RAX/EAX now acts like a return value. RetOps.push_back( DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); } const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (X86::GR64RegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i64)); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); X86ISD::NodeType opcode = X86ISD::RET_FLAG; if (CallConv == CallingConv::X86_INTR) opcode = X86ISD::IRET; return DAG.getNode(opcode, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != X86ISD::RET_FLAG) return false; // If we are returning more than one value, we can definitely // not make a tail call see PR19530 if (UI->getNumOperands() > 4) return false; if (UI->getNumOperands() == 4 && UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } EVT X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const { MVT ReturnMVT; // TODO: Is this also valid on 32-bit? if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) ReturnMVT = MVT::i8; else ReturnMVT = MVT::i32; EVT MinVT = getRegisterType(Context, ReturnMVT); return VT.bitsLT(MinVT) ? MinVT : VT; } /// Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// SDValue X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // Assign locations to each value returned by this call. SmallVector RVLocs; bool Is64Bit = Subtarget->is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; EVT CopyVT = VA.getLocVT(); // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } // If we prefer to use the value in xmm registers, copy it out as f80 and // use a truncate to move it from fp stack reg to xmm reg. bool RoundAfterCopy = false; if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && isScalarFPTypeInSSEReg(VA.getValVT())) { CopyVT = MVT::f80; RoundAfterCopy = (CopyVT != VA.getLocVT()); } Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag).getValue(1); SDValue Val = Chain.getValue(0); if (RoundAfterCopy) Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. DAG.getIntPtrConstant(1, dl)); if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1) Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); InFlag = Chain.getValue(2); InVals.push_back(Val); } return Chain; } //===----------------------------------------------------------------------===// // C & StdCall & Fast Calling Convention implementation //===----------------------------------------------------------------------===// // StdCall calling convention seems to be standard for many Windows' API // routines and around. It differs from C calling convention just a little: // callee should clean up the stack, not caller. Symbols should be also // decorated in some fancy way :) It doesn't support any vector arguments. // For info on fast calling convention see Fast Calling Convention (tail call) // implementation LowerX86_32FastCCCallTo. /// CallIsStructReturn - Determines whether a call uses struct return /// semantics. enum StructReturnType { NotStructReturn, RegStructReturn, StackStructReturn }; static StructReturnType callIsStructReturn(const SmallVectorImpl &Outs, bool IsMCU) { if (Outs.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Outs[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Determines whether a function uses struct return semantics. static StructReturnType argsAreStructReturn(const SmallVectorImpl &Ins, bool IsMCU) { if (Ins.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Ins[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Make a copy of an aggregate at address specified by "Src" to address /// "Dst" with size and alignment information specified by the specific /// parameter attribute. The copy will be passed as a byval function parameter. static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), /*isVolatile*/false, /*AlwaysInline=*/true, /*isTailCall*/false, MachinePointerInfo(), MachinePointerInfo()); } /// Return true if the calling convention is one that we can guarantee TCO for. static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::HiPE || CC == CallingConv::HHVM); } /// Return true if we might ever do TCO for calls with this calling convention. static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { // C calling conventions: case CallingConv::C: case CallingConv::X86_64_Win64: case CallingConv::X86_64_SysV: // Callee pop conventions: case CallingConv::X86_ThisCall: case CallingConv::X86_StdCall: case CallingConv::X86_VectorCall: case CallingConv::X86_FastCall: return true; default: return canGuaranteeTCO(CC); } } /// Return true if the function is being made into a tailcall target by /// changing its ABI. static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { return GuaranteedTailCallOpt && canGuaranteeTCO(CC); } bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { auto Attr = CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; CallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); if (!mayTailCallThisCC(CalleeCC)) return false; return true; } SDValue X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo *MFI, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; bool AlwaysUseMutable = shouldGuaranteeTCO( CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; // If value is passed by pointer we have address passed instead of the value // itself. bool ExtendedInMem = VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1; if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) ValVT = VA.getLocVT(); else ValVT = VA.getValVT(); // Calculate SP offset of interrupt parameter, re-arrange the slot normally // taken by a return address. int Offset = 0; if (CallConv == CallingConv::X86_INTR) { const X86Subtarget& Subtarget = static_cast(DAG.getSubtarget()); // X86 interrupts may take one or two arguments. // On the stack there will be no return address as in regular call. // Offset of last argument need to be set to -4/-8 bytes. // Where offset of the first argument out of two, should be set to 0 bytes. Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); } // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they // could be overwritten by lowering of arguments in case of a tail call. if (Flags.isByVal()) { unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); // Adjust SP offset of interrupt parameter. if (CallConv == CallingConv::X86_INTR) { MFI->setObjectOffset(FI, Offset); } return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); } else { int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, VA.getLocMemOffset(), isImmutable); // Adjust SP offset of interrupt parameter. if (CallConv == CallingConv::X86_INTR) { MFI->setObjectOffset(FI, Offset); } SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, false, false, 0); return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val; } } // FIXME: Get this from tablegen. static ArrayRef get64BitArgumentGPRs(CallingConv::ID CallConv, const X86Subtarget *Subtarget) { assert(Subtarget->is64Bit()); if (Subtarget->isCallingConvWin64(CallConv)) { static const MCPhysReg GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); } static const MCPhysReg GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); } // FIXME: Get this from tablegen. static ArrayRef get64BitArgumentXMMs(MachineFunction &MF, CallingConv::ID CallConv, const X86Subtarget *Subtarget) { assert(Subtarget->is64Bit()); if (Subtarget->isCallingConvWin64(CallConv)) { // The XMM registers which might contain var arg parameters are shadowed // in their paired GPR. So we only need to save the GPR to their home // slots. // TODO: __vectorcall will change this. return None; } const Function *Fn = MF.getFunction(); bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat); bool isSoftFloat = Subtarget->useSoftFloat(); assert(!(isSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); if (isSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) // Kernel mode asks for SSE to be disabled, so there are no XMM argument // registers. return None; static const MCPhysReg XMMArgRegs64Bit[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } SDValue X86TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); const Function* Fn = MF.getFunction(); if (Fn->hasExternalLinkage() && Subtarget->isTargetCygMing() && Fn->getName() == "main") FuncInfo->setForceFramePointer(true); MachineFrameInfo *MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); if (CallConv == CallingConv::X86_INTR) { bool isLegal = Ins.size() == 1 || (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || (!Is64Bit && Ins[1].VT == MVT::i32))); if (!isLegal) report_fatal_error("X86 interrupts may take one or two arguments"); } // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeFormalArguments(Ins, CC_X86); unsigned LastVal = ~0U; SDValue ArgValue; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; // TODO: If an arg is passed in two places (e.g. reg and stack), skip later // places. assert(VA.getValNo() != LastVal && "Don't support value assigned to multiple locs yet"); (void)LastVal; LastVal = VA.getValNo(); if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); const TargetRegisterClass *RC; if (RegVT == MVT::i32) RC = &X86::GR32RegClass; else if (Is64Bit && RegVT == MVT::i64) RC = &X86::GR64RegClass; else if (RegVT == MVT::f32) RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = &X86::FR64RegClass; else if (RegVT == MVT::f128) RC = &X86::FR128RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) RC = &X86::VR256RegClass; else if (RegVT.is128BitVector()) RC = &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; else if (RegVT == MVT::i1) RC = &X86::VK1RegClass; else if (RegVT == MVT::v8i1) RC = &X86::VK8RegClass; else if (RegVT == MVT::v16i1) RC = &X86::VK16RegClass; else if (RegVT == MVT::v32i1) RC = &X86::VK32RegClass; else if (RegVT == MVT::v64i1) RC = &X86::VK64RegClass; else llvm_unreachable("Unknown argument type!"); unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); // If this is an 8 or 16-bit value, it is really passed promoted to 32 // bits. Insert an assert[sz]ext to capture this, then truncate to the // right size. if (VA.getLocInfo() == CCValAssign::SExt) ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::ZExt) ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::BCvt) ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); if (VA.isExtInLoc()) { // Handle MMX values passed in XMM regs. if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); else ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); } } else { assert(VA.isMemLoc()); ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); } // If value is passed via pointer - do a load. if (VA.getLocInfo() == CCValAssign::Indirect) ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo(), false, false, false, 0); InVals.push_back(ArgValue); } for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // All x86 ABIs require that for returning structs by value we copy the // sret argument into %rax/%eax (depending on ABI) for the return. Save // the argument into a virtual register so that we can access it from the // return points. if (Ins[i].Flags.isSRet()) { unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { MVT PtrTy = getPointerTy(DAG.getDataLayout()); Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); break; } } unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. if (shouldGuaranteeTCO(CallConv, MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. We // can skip this if there are no va_start calls. if (MFI->hasVAStart() && (Is64Bit || (CallConv != CallingConv::X86_FastCall && CallConv != CallingConv::X86_ThisCall))) { FuncInfo->setVarArgsFrameIndex( MFI->CreateFixedObject(1, StackSize, true)); } // Figure out if XMM registers are in use. assert(!(Subtarget->useSoftFloat() && Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); // 64-bit calling conventions support varargs and register parameters, so we // have to do extra work to spill them in the prologue. if (Is64Bit && isVarArg && MFI->hasVAStart()) { // Find the first unallocated argument registers. ArrayRef ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); // Gather all the live in physical registers. SmallVector LiveGPRs; SmallVector LiveXMMRegs; SDValue ALVal; for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); LiveGPRs.push_back( DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); } if (!ArgXMMs.empty()) { unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); LiveXMMRegs.push_back( DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); } } if (IsWin64) { // Get to the caller-allocated home save location. Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; FuncInfo->setRegSaveFrameIndex( MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); // Fixup to set vararg frame on shadow area (4 x i64). if (NumIntRegs < 4) FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); } else { // For X86-64, if there are vararg parameters that are passed via // registers, then we must store them to their spots on the stack so // they may be loaded by deferencing the result of va_next. FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); } // Store the integer parameter registers. SmallVector MemOps; SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy(DAG.getDataLayout())); unsigned Offset = FuncInfo->getVarArgsGPOffset(); for (SDValue Val : LiveGPRs) { SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, DAG.getIntPtrConstant(Offset, dl)); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(), Offset), false, false, 0); MemOps.push_back(Store); Offset += 8; } if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { // Now store the XMM (fp + vector) parameter registers. SmallVector SaveXMMOps; SaveXMMOps.push_back(Chain); SaveXMMOps.push_back(ALVal); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getRegSaveFrameIndex(), dl)); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getVarArgsFPOffset(), dl)); SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), LiveXMMRegs.end()); MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, MVT::Other, SaveXMMOps)); } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } if (isVarArg && MFI->hasMustTailInVarArgFunc()) { // Find the largest legal vector type. MVT VecVT = MVT::Other; // FIXME: Only some x86_32 calling conventions support AVX512. if (Subtarget->hasAVX512() && (Is64Bit || (CallConv == CallingConv::X86_VectorCall || CallConv == CallingConv::Intel_OCL_BI))) VecVT = MVT::v16f32; else if (Subtarget->hasAVX()) VecVT = MVT::v8f32; else if (Subtarget->hasSSE2()) VecVT = MVT::v4f32; // We forward some GPRs and some vector types. SmallVector RegParmTypes; MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; RegParmTypes.push_back(IntVT); if (VecVT != MVT::Other) RegParmTypes.push_back(VecVT); // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); // Conservatively forward AL on x86_64, since it might be used for varargs. if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); } // Copy all forwards from physical to virtual registers. for (ForwardedRegister &F : Forwards) { // FIXME: Can we use a less constrained schedule? SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); } } // Some CCs need callee pop. if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { // X86 interrupts must pop the error code if present FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && argsAreStructReturn(Ins, Subtarget->isTargetMCU()) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } if (!Is64Bit) { // RegSaveFrameIndex is X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); if (CallConv == CallingConv::X86_FastCall || CallConv == CallingConv::X86_ThisCall) // fastcc functions can't have varargs. FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); } FuncInfo->setArgumentStackSize(StackSize); if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn()); if (Personality == EHPersonality::CoreCLR) { assert(Is64Bit); // TODO: Add a mechanism to frame lowering that will allow us to indicate // that we'd prefer this slot be allocated towards the bottom of the frame // (i.e. near the stack pointer after allocating the frame). Every // funclet needs a copy of this slot in its (mostly empty) frame, and the // offset from the bottom of this and each funclet's frame must be the // same, so the size of funclets' (mostly empty) frames is dictated by // how far this slot is from the bottom (since they allocate just enough // space to accomodate holding this slot at the correct offset). int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); EHInfo->PSPSymFrameIdx = PSPSymFI; } } return Chain; } SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); return DAG.getStore( Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), false, false, 0); } /// Emit a load of return address if tail call /// optimization is performed and it is required. SDValue X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, int FPDiff, SDLoc dl) const { // Adjust the Return address stack slot. EVT VT = getPointerTy(DAG.getDataLayout()); OutRetAddr = getReturnAddressFrameIndex(DAG); // Load the "old" Return address. OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), false, false, false, 0); return SDValue(OutRetAddr.getNode(), 1); } /// Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, unsigned SlotSize, int FPDiff, SDLoc dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. int NewReturnAddrFI = MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), NewReturnAddrFI), false, false, 0); return Chain; } /// Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector Mask; Mask.push_back(NumElems); for (unsigned i = 1; i != NumElems; ++i) Mask.push_back(i); return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &dl = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; CallingConv::ID CallConv = CLI.CallConv; bool &isTailCall = CLI.IsTailCall; bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs, Subtarget->isTargetMCU()); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo(); auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); if (Attr.getValueAsString() == "true") isTailCall = false; if (Subtarget->isPICStyleGOT() && !MF.getTarget().Options.GuaranteedTailCallOpt) { // If we are using a GOT, disable tail calls to external symbols with // default visibility. Tail calling such a symbol requires using a GOT // relocation, which forces early binding of the symbol. This breaks code // that require lazy function symbol resolution. Using musttail or // GuaranteedTailCallOpt will override this. GlobalAddressSDNode *G = dyn_cast(Callee); if (!G || (!G->getGlobal()->hasLocalLinkage() && G->getGlobal()->hasDefaultVisibility())) isTailCall = false; } bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); if (IsMustTail) { // Force this to be a tail call. The verifier rules are enough to ensure // that we can lower this successfully without moving the return address // around. isTailCall = true; } else if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, MF.getFunction()->hasStructRetAttr(), CLI.RetTy, Outs, OutVals, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require // ABI changes. if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) IsSibcall = true; if (isTailCall) ++NumTailCalls; } assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); if (IsSibcall) // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; else if (MF.getTarget().Options.GuaranteedTailCallOpt && canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; if (isTailCall && !IsSibcall && !IsMustTail) { // Lower arguments at fp - stackoffset + fpdiff. unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); FPDiff = NumBytesCallerPushed - NumBytes; // Set the delta of movement of the returnaddr stackslot. // But only set if delta is greater than previous delta. if (FPDiff < X86Info->getTCReturnAddrDelta()) X86Info->setTCReturnAddrDelta(FPDiff); } unsigned NumBytesToPush = NumBytes; unsigned NumBytesToPop = NumBytes; // If we have an inalloca argument, all stack space has already been allocated // for us and be right at the top of the stack. We don't support multiple // arguments passed in memory when using inalloca. if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { NumBytesToPush = 0; if (!ArgLocs.back().isMemLoc()) report_fatal_error("cannot use inalloca attribute on a register " "parameter"); if (ArgLocs.back().getLocMemOffset() != 0) report_fatal_error("any parameter with the inalloca attribute must be " "the only memory argument"); } if (!IsSibcall) Chain = DAG.getCALLSEQ_START( Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl); SDValue RetAddrFrIdx; // Load return address for tail calls. if (isTailCall && FPDiff) Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, FPDiff, dl); SmallVector, 8> RegsToPass; SmallVector MemOpChains; SDValue StackPtr; // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[i].Flags; if (Flags.isInAlloca()) continue; CCValAssign &VA = ArgLocs[i]; EVT RegVT = VA.getLocVT(); SDValue Arg = OutVals[i]; bool isByVal = Flags.isByVal(); // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); break; case CCValAssign::AExt: if (Arg.getValueType().isVector() && Arg.getValueType().getVectorElementType() == MVT::i1) Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. Arg = DAG.getBitcast(MVT::i64, Arg); Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); } else Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); break; case CCValAssign::BCvt: Arg = DAG.getBitcast(RegVT, Arg); break; case CCValAssign::Indirect: { // Store the argument. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast(SpillSlot)->getIndex(); Chain = DAG.getStore( Chain, dl, Arg, SpillSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, false, 0); Arg = SpillSlot; break; } } if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding // shadow reg if callee is a varargs function. unsigned ShadowReg = 0; switch (VA.getLocReg()) { case X86::XMM0: ShadowReg = X86::RCX; break; case X86::XMM1: ShadowReg = X86::RDX; break; case X86::XMM2: ShadowReg = X86::R8; break; case X86::XMM3: ShadowReg = X86::R9; break; } if (ShadowReg) RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); } } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); if (Subtarget->isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. if (!isTailCall) { RegsToPass.push_back(std::make_pair( unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())))); } else { // If we are tail calling and generating PIC/GOT style code load the // address of the callee into ECX. The value in ecx is used as target of // the tail jump. This is done to circumvent the ebx/callee-saved problem // for tail calls on PIC/GOT architectures. Normally we would just put the // address of GOT into ebx and then call target@PLT. But for tail calls // ebx would be restored (since ebx is callee saved) before jumping to the // target@PLT. // Note: The actual moving to ECX is done further down. GlobalAddressSDNode *G = dyn_cast(Callee); if (G && !G->getGlobal()->hasLocalLinkage() && G->getGlobal()->hasDefaultVisibility()) Callee = LowerGlobalAddress(Callee, DAG); else if (isa(Callee)) Callee = LowerExternalSymbol(Callee, DAG); } } if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in // the declaration) %al is used as hidden argument to specify the number // of SSE registers used. The contents of %al do not need to match exactly // the number of registers, but must be an ubound on the number of SSE // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); assert((Subtarget->hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); RegsToPass.push_back(std::make_pair(unsigned(X86::AL), DAG.getConstant(NumXMMRegs, dl, MVT::i8))); } if (isVarArg && IsMustTail) { const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); } } // For tail calls lower the arguments to the 'real' stack slots. Sibcalls // don't need this because the eligibility check rejects calls that require // shuffling arguments passed in memory. if (!IsSibcall && isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments are stored to the stack, because the // outgoing stack slots may alias the incoming argument stack slots, and // the alias isn't otherwise explicit. This is slightly more conservative // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); SmallVector MemOpChains2; SDValue FIN; int FI = 0; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (VA.isRegLoc()) continue; assert(VA.isMemLoc()); SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; // Skip inalloca arguments. They don't require any work. if (Flags.isInAlloca()) continue; // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); if (Flags.isByVal()) { // Copy relative to framepointer. SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, Source); MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, ArgChain, Flags, DAG, dl)); } else { // Store relative to framepointer. MemOpChains2.push_back(DAG.getStore( ArgChain, dl, Arg, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, false, 0)); } } if (!MemOpChains2.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, getPointerTy(DAG.getDataLayout()), RegInfo->getSlotSize(), FPDiff, dl); } // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into registers. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } if (DAG.getTarget().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls // through a register, since the call instruction's 32-bit // pc-relative offset may not be large enough to hold the whole // address. } else if (Callee->getOpcode() == ISD::GlobalAddress) { // If the callee is a GlobalAddress node (quite common, every direct call // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack // it. GlobalAddressSDNode* G = cast(Callee); // We should use extra load for direct calls to dllimported functions in // non-JIT mode. const GlobalValue *GV = G->getGlobal(); if (!GV->hasDLLImportStorageClass()) { unsigned char OpFlags = 0; bool ExtraLoad = false; unsigned WrapperKind = ISD::DELETED_NODE; // On ELF targets, in both X86-64 and X86-32 mode, direct calls to // external symbols most go through the PLT in PIC mode. If the symbol // has hidden or protected visibility, or if it is static or local, then // we don't need to use the PLT - we can directly call it. if (Subtarget->isTargetELF() && DAG.getTarget().getRelocationModel() == Reloc::PIC_ && GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && !GV->isStrongDefinitionForLinker() && (!Subtarget->getTargetTriple().isMacOSX() || Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. OpFlags = X86II::MO_DARWIN_STUB; } else if (Subtarget->isPICStyleRIPRel() && isa(GV) && cast(GV)->hasFnAttribute(Attribute::NonLazyBind)) { // If the function is marked as non-lazy, generate an indirect call // which loads from the GOT directly. This avoids runtime overhead // at the cost of eager binding (and one extra byte of encoding). OpFlags = X86II::MO_GOTPCREL; WrapperKind = X86ISD::WrapperRIP; ExtraLoad = true; } Callee = DAG.getTargetGlobalAddress( GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags); // Add a wrapper if needed. if (WrapperKind != ISD::DELETED_NODE) Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(DAG.getDataLayout()), Callee); // Add extra indirection if needed. if (ExtraLoad) Callee = DAG.getLoad( getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { unsigned char OpFlags = 0; // On ELF targets, in either X86-64 or X86-32 mode, direct calls to // external symbols should go through the PLT. if (Subtarget->isTargetELF() && DAG.getTarget().getRelocationModel() == Reloc::PIC_) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (!Subtarget->getTargetTriple().isMacOSX() || Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. OpFlags = X86II::MO_DARWIN_STUB; } Callee = DAG.getTargetExternalSymbol( S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); } // Returns a chain & a flag for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SmallVector Ops; if (!IsSibcall && isTailCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } Ops.push_back(Chain); Ops.push_back(Callee); if (isTailCall) Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32)); // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); // If this is an invoke in a 32-bit function using a funclet-based // personality, assume the function clobbers all registers. If an exception // is thrown, the runtime will not restore CSRs. // FIXME: Model this more precisely so that we can register allocate across // the normal edge and spill and fill across the exceptional edge. if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) { const Function *CallerFn = MF.getFunction(); EHPersonality Pers = CallerFn->hasPersonalityFn() ? classifyEHPersonality(CallerFn->getPersonalityFn()) : EHPersonality::Unknown; if (isFuncletEHPersonality(Pers)) Mask = RegInfo->getNoPreservedMask(); } Ops.push_back(DAG.getRegisterMask(Mask)); if (InFlag.getNode()) Ops.push_back(InFlag); if (isTailCall) { // We used to do: //// If this is the first return lowered for this function, add the regs //// to the liveout set for the function. // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. MF.getFrameInfo()->setHasTailCall(); return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); } Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. unsigned NumBytesForCalleeToPop; if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything else if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && SR == StackStructReturn) // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. // This is common for Darwin/X86, Linux & Mingw32 targets. // For MSVC Win32 targets, the caller pops the hidden struct pointer. NumBytesForCalleeToPop = 4; else NumBytesForCalleeToPop = 0; // Callee pops nothing. // Returns a flag for retval copy to use. if (!IsSibcall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } // Handle result values, copying them out of physregs into vregs that we // return. return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, InVals); } //===----------------------------------------------------------------------===// // Fast Calling Convention (tail call) implementation //===----------------------------------------------------------------------===// // Like std call, callee cleans arguments, convention except that ECX is // reserved for storing the tail called function address. Only 2 registers are // free for argument passing (inreg). Tail call optimization is performed // provided: // * tailcallopt is enabled // * caller/callee are fastcc // On X86_64 architecture with GOT-style position independent code only local // (within module) calls are supported at the moment. // To keep the stack aligned according to platform abi the function // GetAlignedArgumentStackSize ensures that argument delta is always multiples // of stack alignment. (Dynamic linkers need this - darwin's dyld for example) // If a tail called function callee has more arguments than the caller the // caller needs to make sure that there is room to move the RETADDR to. This is // achieved by reserving an area the size of the argument delta right after the // original RETADDR, but before the saved framepointer or the spilled registers // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) // stack layout: // arg1 // arg2 // RETADDR // [ new RETADDR // move area ] // (possible EBP) // ESI // EDI // local1 .. /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align /// requirement. unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; unsigned SlotSize = RegInfo->getSlotSize(); if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { // Number smaller than 12 so just add the difference. Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); } else { // Mask out lower bits, add stackalignment once plus the 12 bytes. Offset = ((~AlignMask) & Offset) + StackAlignment + (StackAlignment-SlotSize); } return Offset; } /// Return true if the given stack call argument is already available in the /// same position (relatively) of the caller's incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, const X86InstrInfo *TII) { unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast(Arg.getOperand(1))->getReg(); if (!TargetRegisterInfo::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) return false; if (!Flags.isByVal()) { if (!TII->isLoadFromStackSlot(Def, FI)) return false; } else { unsigned Opcode = Def->getOpcode(); if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) && Def->getOperand(1).isFI()) { FI = Def->getOperand(1).getIndex(); Bytes = Flags.getByValSize(); } else return false; } } else if (LoadSDNode *Ld = dyn_cast(Arg)) { if (Flags.isByVal()) // ByVal argument is passed in as a pointer but it's now being // dereferenced. e.g. // define @foo(%struct.X* %A) { // tail call @bar(%struct.X* byval %A) // } return false; SDValue Ptr = Ld->getBasePtr(); FrameIndexSDNode *FINode = dyn_cast(Ptr); if (!FINode) return false; FI = FINode->getIndex(); } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { FrameIndexSDNode *FINode = cast(Arg); FI = FINode->getIndex(); Bytes = Flags.getByValSize(); } else return false; assert(FI != INT_MAX); if (!MFI->isFixedObjectIndex(FI)) return false; return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); } /// Check whether the call is eligible for tail call optimization. Targets /// that want to do tail call optimization should implement this function. bool X86TargetLowering::IsEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const { if (!mayTailCallThisCC(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, // then the FP_EXTEND of the call result is not a nop. It's not safe to // perform a tailcall optimization here. if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) return false; CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); // Win64 functions have extra shadow space for argument homing. Don't do the // sibcall if the caller and callee have mismatched expectations for this // space. if (IsCalleeWin64 != IsCallerWin64) return false; if (DAG.getTarget().Options.GuaranteedTailCallOpt) { if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; } // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); if (RegInfo->needsStackRealignment(MF)) return false; // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; // Do not sibcall optimize vararg calls unless all arguments are passed via // registers. if (isVarArg && !Outs.empty()) { // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. if (IsCalleeWin64 || IsCallerWin64) return false; SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) if (!ArgLocs[i].isRegLoc()) return false; } // If the call result is in ST0 / ST1, it needs to be popped off the x87 // stack. Therefore, if it's not used by the call it is not safe to optimize // this into a sibcall. bool Unused = false; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (!Ins[i].Used) { Unused = true; break; } } if (Unused) { SmallVector RVLocs; CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; } } // If the calling conventions do not match, then we'd better make sure the // results are returned in the same way as what the caller expects. if (!CCMatch) { SmallVector RVLocs1; CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector RVLocs2; CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) return false; for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) return false; if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) return false; if (RVLocs1[i].isRegLoc()) { if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) return false; } else { if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) return false; } } } unsigned StackArgsSize = 0; // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsCalleeWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); StackArgsSize = CCInfo.getNextStackOffset(); if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = Subtarget->getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; if (!VA.isRegLoc()) { if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, TII)) return false; } } } // If the tailcall address may be in a register, then make sure it's // possible to register allocate for it. In 32-bit, the call address can // only target EAX, EDX, or ECX since the tail call must be scheduled after // callee-saved registers are restored. These happen to be the same // registers used to pass 'inreg' arguments so watch out for those. if (!Subtarget->is64Bit() && ((!isa(Callee) && !isa(Callee)) || DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { unsigned NumInRegs = 0; // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) continue; unsigned Reg = VA.getLocReg(); switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: if (++NumInRegs == MaxInRegs) return false; break; } } } } bool CalleeWillPop = X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt); if (unsigned BytesToPop = MF.getInfo()->getBytesToPopOnReturn()) { // If we have bytes to pop, the callee must pop them. bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; if (!CalleePopMatches) return false; } else if (CalleeWillPop && StackArgsSize > 0) { // If we don't have bytes to pop, make sure the callee doesn't pop any. return false; } return true; } FastISel * X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return X86::createFastISel(funcInfo, libInfo); } //===----------------------------------------------------------------------===// // Other Lowering Hooks //===----------------------------------------------------------------------===// static bool MayFoldLoad(SDValue Op) { return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); } static bool MayFoldIntoStore(SDValue Op) { return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); } static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; case X86ISD::BLENDI: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: case X86ISD::INSERTPS: case X86ISD::PALIGNR: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: case X86ISD::MOVLPS: case X86ISD::MOVLPD: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: return true; } } static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::VPERMILPI: case X86ISD::VPERMI: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, dl, MVT::i8)); } } static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: case X86ISD::MOVLPS: case X86ISD::MOVLPD: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: return DAG.getNode(Opc, dl, VT, V1, V2); } } SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); int ReturnAddrIndex = FuncInfo->getRAIndex(); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -(int64_t)SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); } return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); } bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra // restrictions. if (!hasSymbolicDisplacement) return true; // FIXME: Some tweaks might be needed for medium code model. if (M != CodeModel::Small && M != CodeModel::Kernel) return false; // For small code model we assume that latest object is 16MB before end of 31 // bits boundary. We may also accept pretty large negative constants knowing // that all objects are in the positive half of address space. if (M == CodeModel::Small && Offset < 16*1024*1024) return true; // For kernel code model we know that all object resist in the negative half // of 32bits address space. We may not accept negative offsets, since they may // be just off and we may accept pretty large positive ones. if (M == CodeModel::Kernel && Offset >= 0) return true; return false; } /// Determines whether the callee is required to pop its own arguments. /// Callee pop is necessary to support tail calls. bool X86::isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { // If GuaranteeTCO is true, we force some calls to be callee pop so that we // can guarantee TCO. if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) return true; switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::X86_VectorCall: return !is64Bit; } } /// \brief Return true if the condition is an unsigned comparison operation. static bool isX86CCUnsigned(unsigned X86CC) { switch (X86CC) { default: llvm_unreachable("Invalid integer condition!"); case X86::COND_E: return true; case X86::COND_G: return false; case X86::COND_GE: return false; case X86::COND_L: return false; case X86::COND_LE: return false; case X86::COND_NE: return true; case X86::COND_B: return true; case X86::COND_A: return true; case X86::COND_BE: return true; case X86::COND_AE: return true; } } static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { switch (SetCCOpcode) { default: llvm_unreachable("Invalid integer condition!"); case ISD::SETEQ: return X86::COND_E; case ISD::SETGT: return X86::COND_G; case ISD::SETGE: return X86::COND_GE; case ISD::SETLT: return X86::COND_L; case ISD::SETLE: return X86::COND_LE; case ISD::SETNE: return X86::COND_NE; case ISD::SETULT: return X86::COND_B; case ISD::SETUGT: return X86::COND_A; case ISD::SETULE: return X86::COND_BE; case ISD::SETUGE: return X86::COND_AE; } } /// Do a one-to-one translation of a ISD::CondCode to the X86-specific /// condition code, returning the condition code and the LHS/RHS of the /// comparison to make. static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { if (!isFP) { if (ConstantSDNode *RHSC = dyn_cast(RHS)) { if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { // X > -1 -> X == 0, jump !sign. RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_NS; } if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { // X < 0 -> X == 0, jump on sign. return X86::COND_S; } if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_LE; } } return TranslateIntegerX86CC(SetCCOpcode); } // First determine if it is required or is profitable to flip the operands. // If LHS is a foldable load, but RHS is not, flip the condition. if (ISD::isNON_EXTLoad(LHS.getNode()) && !ISD::isNON_EXTLoad(RHS.getNode())) { SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); std::swap(LHS, RHS); } switch (SetCCOpcode) { default: break; case ISD::SETOLT: case ISD::SETOLE: case ISD::SETUGT: case ISD::SETUGE: std::swap(LHS, RHS); break; } // On a floating point condition, the flags are set as follows: // ZF PF CF op // 0 | 0 | 0 | X > Y // 0 | 0 | 1 | X < Y // 1 | 0 | 0 | X == Y // 1 | 1 | 1 | unordered switch (SetCCOpcode) { default: llvm_unreachable("Condcode should be pre-legalized away"); case ISD::SETUEQ: case ISD::SETEQ: return X86::COND_E; case ISD::SETOLT: // flipped case ISD::SETOGT: case ISD::SETGT: return X86::COND_A; case ISD::SETOLE: // flipped case ISD::SETOGE: case ISD::SETGE: return X86::COND_AE; case ISD::SETUGT: // flipped case ISD::SETULT: case ISD::SETLT: return X86::COND_B; case ISD::SETUGE: // flipped case ISD::SETULE: case ISD::SETLE: return X86::COND_BE; case ISD::SETONE: case ISD::SETNE: return X86::COND_NE; case ISD::SETUO: return X86::COND_P; case ISD::SETO: return X86::COND_NP; case ISD::SETOEQ: case ISD::SETUNE: return X86::COND_INVALID; } } /// Is there a floating point cmov for the specific X86 condition code? /// Current x86 isa includes the following FP cmov instructions: /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { default: return false; case X86::COND_B: case X86::COND_BE: case X86::COND_E: case X86::COND_P: case X86::COND_A: case X86::COND_AE: case X86::COND_NE: case X86::COND_NP: return true; } } bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const { const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); if (!IntrData) return false; switch (IntrData->Type) { case LOADA: case LOADU: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = (IntrData->Type == LOADA ? Info.memVT.getSizeInBits()/8 : 1); Info.vol = false; Info.readMem = true; Info.writeMem = false; return true; } default: break; } return false; } /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) return true; } return false; } bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF // relocation target a movq or addq instruction: don't let the load shrink. SDValue BasePtr = cast(Load)->getBasePtr(); if (BasePtr.getOpcode() == X86ISD::WrapperRIP) if (const auto *GA = dyn_cast(BasePtr.getOperand(0))) return GA->getTargetFlags() != X86II::MO_GOTTPOFF; return true; } /// \brief Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); if (BitSize == 0 || BitSize > 64) return false; return true; } bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; return (Index == 0 || Index == ResVT.getVectorNumElements()); } bool X86TargetLowering::isCheapToSpeculateCttz() const { // Speculate cttz only if we can directly use TZCNT. return Subtarget->hasBMI(); } bool X86TargetLowering::isCheapToSpeculateCtlz() const { // Speculate ctlz only if we can directly use LZCNT. return Subtarget->hasLZCNT(); } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size is undef. static bool isUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i) if (0 <= Mask[i]) return false; return true; } /// Return true if Val is undef or if its value falls within the /// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val < 0) || (Val >= Low && Val < Hi); } /// Val is either less than zero (undef) or equal to the specified value. static bool isUndefOrEqual(int Val, int CmpVal) { return (Val < 0 || Val == CmpVal); } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size]. or is undef. static bool isSequentialOrUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size, int Low) { for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) if (!isUndefOrEqual(Mask[i], Low)) return false; return true; } /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector /// extract that is suitable for instruction that extract 128 or 256 bit vectors static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa(N->getOperand(1).getNode())) return false; // The index should be aligned on a vecWidth-bit boundary. uint64_t Index = cast(N->getOperand(1).getNode())->getZExtValue(); MVT VT = N->getSimpleValueType(0); unsigned ElSize = VT.getVectorElementType().getSizeInBits(); bool Result = (Index * ElSize) % vecWidth == 0; return Result; } /// Return true if the specified INSERT_SUBVECTOR /// operand specifies a subvector insert that is suitable for input to /// insertion of 128 or 256-bit subvectors static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa(N->getOperand(2).getNode())) return false; // The index should be aligned on a vecWidth-bit boundary. uint64_t Index = cast(N->getOperand(2).getNode())->getZExtValue(); MVT VT = N->getSimpleValueType(0); unsigned ElSize = VT.getVectorElementType().getSizeInBits(); bool Result = (Index * ElSize) % vecWidth == 0; return Result; } bool X86::isVINSERT128Index(SDNode *N) { return isVINSERTIndex(N, 128); } bool X86::isVINSERT256Index(SDNode *N) { return isVINSERTIndex(N, 256); } bool X86::isVEXTRACT128Index(SDNode *N) { return isVEXTRACTIndex(N, 128); } bool X86::isVEXTRACT256Index(SDNode *N) { return isVEXTRACTIndex(N, 256); } static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); assert(isa(N->getOperand(1).getNode()) && "Illegal extract subvector for VEXTRACT"); uint64_t Index = cast(N->getOperand(1).getNode())->getZExtValue(); MVT VecVT = N->getOperand(0).getSimpleValueType(); MVT ElVT = VecVT.getVectorElementType(); unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; } static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); assert(isa(N->getOperand(2).getNode()) && "Illegal insert subvector for VINSERT"); uint64_t Index = cast(N->getOperand(2).getNode())->getZExtValue(); MVT VecVT = N->getSimpleValueType(0); MVT ElVT = VecVT.getVectorElementType(); unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; } /// Return the appropriate immediate to extract the specified /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions. unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 128); } /// Return the appropriate immediate to extract the specified /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions. unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 256); } /// Return the appropriate immediate to insert at the specified /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions. unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 128); } /// Return the appropriate immediate to insert at the specified /// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions. unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 256); } /// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { return isNullConstant(Elt) || isNullFPConstant(Elt); } // Build a vector of constants // Use an UNDEF node if MaskElt == -1. // Spilt 64-bit constants in the 32-bit mode. static SDValue getConstVector(ArrayRef Values, MVT VT, SelectionDAG &DAG, SDLoc dl, bool IsMask = false) { SmallVector Ops; bool Split = false; MVT ConstVecVT = VT; unsigned NumElts = VT.getVectorNumElements(); bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); Split = true; } MVT EltVT = ConstVecVT.getVectorElementType(); for (unsigned i = 0; i < NumElts; ++i) { bool IsUndef = Values[i] < 0 && IsMask; SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(Values[i], dl, EltVT); Ops.push_back(OpNode); if (Split) Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(0, dl, EltVT)); } SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops); if (Split) ConstsNode = DAG.getBitcast(VT, ConstsNode); return ConstsNode; } /// Returns a vector of specified type with all zero elements. static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); // Always build SSE zero vectors as <4 x i32> bitcasted // to their dest type. This ensures they get CSE'd. SDValue Vec; if (VT.is128BitVector()) { // SSE if (Subtarget->hasSSE2()) { // SSE2 SDValue Cst = DAG.getConstant(0, dl, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else { // SSE1 SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } } else if (VT.is256BitVector()) { // AVX if (Subtarget->hasInt256()) { // AVX2 SDValue Cst = DAG.getConstant(0, dl, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // 256-bit logic and arithmetic instructions in AVX are all // floating-point, no support for integer ops. Emit fp zeroed vectors. SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); } } else if (VT.is512BitVector()) { // AVX-512 SDValue Cst = DAG.getConstant(0, dl, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.getVectorElementType() == MVT::i1) { assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16) && "Unexpected vector type"); assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8) && "Unexpected vector type"); SDValue Cst = DAG.getConstant(0, dl, MVT::i1); SmallVector Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } else llvm_unreachable("Unexpected vector type"); return DAG.getBitcast(VT, Vec); } static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl, unsigned vectorWidth) { assert((vectorWidth == 128 || vectorWidth == 256) && "Unsupported vector width"); EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); unsigned Factor = VT.getSizeInBits()/vectorWidth; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); // Extract from UNDEF is UNDEF. if (Vec.getOpcode() == ISD::UNDEF) return DAG.getUNDEF(ResultVT); // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. IdxVal &= ~(ElemsPerChunk - 1); // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } /// Generate a DAG to grab 128-bits from a vector > 128 bits. This /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 /// instructions or a simple subregister reference. Idx is an index in the /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { assert((Vec.getValueType().is256BitVector() || Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); } /// Generate a DAG to grab 256-bits from a 512-bit vector. static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); } static SDValue InsertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl, unsigned vectorWidth) { assert((vectorWidth == 128 || vectorWidth == 256) && "Unsupported vector width"); // Inserting UNDEF is Result if (Vec.getOpcode() == ISD::UNDEF) return Result; EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); EVT ResultVT = Result.getValueType(); // Insert the relevant vectorWidth bits. unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. IdxVal &= ~(ElemsPerChunk - 1); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } /// Generate a DAG to put 128-bits into a vector > 128 bits. This /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a /// simple superregister reference. Idx is an index in the 128 bits /// we want. It need not be aligned to a 128-bit boundary. That makes /// lowering INSERT_VECTOR_ELT operations easier. static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); // For insertion into the zero index (low half) of a 256-bit vector, it is // more efficient to generate a blend with immediate instead of an insert*128. // We are still creating an INSERT_SUBVECTOR below with an undef node to // extend the subvector to the size of the result vector. Make sure that // we are not recursing on that node by checking for undef here. if (IdxVal == 0 && Result.getValueType().is256BitVector() && Result.getOpcode() != ISD::UNDEF) { EVT ResultVT = Result.getValueType(); SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl); SDValue Undef = DAG.getUNDEF(ResultVT); SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef, Vec, ZeroIndex); // The blend instruction, and therefore its mask, depend on the data type. MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT(); if (ScalarType.isFloatingPoint()) { // Choose either vblendps (float) or vblendpd (double). unsigned ScalarSize = ScalarType.getSizeInBits(); assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type"); unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f; SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8); return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask); } const X86Subtarget &Subtarget = static_cast(DAG.getSubtarget()); // AVX2 is needed for 256-bit integer blend support. // Integers must be cast to 32-bit because there is only vpblendd; // vpblendw can't be used for this because it has a handicapped mask. // If we don't have AVX2, then cast to float. Using a wrong domain blend // is still more efficient than using the wrong domain vinsertf128 that // will be created by InsertSubVector(). MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32; SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); Result = DAG.getBitcast(CastVT, Result); Vec256 = DAG.getBitcast(CastVT, Vec256); Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); return DAG.getBitcast(ResultVT, Vec256); } return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } /// Insert i1-subvector to i1-vector. static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue SubVec = Op.getOperand(1); SDValue Idx = Op.getOperand(2); if (!isa(Idx)) return SDValue(); unsigned IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0 && Vec.isUndef()) // the operation is legal return Op; MVT OpVT = Op.getSimpleValueType(); MVT SubVecVT = SubVec.getSimpleValueType(); unsigned NumElems = OpVT.getVectorNumElements(); unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); assert(IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && "Unexpected index value in INSERT_SUBVECTOR"); // There are 3 possible cases: // 1. Subvector should be inserted in the lower part (IdxVal == 0) // 2. Subvector should be inserted in the upper part // (IdxVal + SubVecNumElems == NumElems) // 3. Subvector should be inserted in the middle (for example v2i1 // to v16i1, index 2) SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); SDValue Undef = DAG.getUNDEF(OpVT); SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx); if (Vec.isUndef()) return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, DAG.getConstant(IdxVal, dl, MVT::i8)); if (ISD::isBuildVectorAllZeros(Vec.getNode())) { unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, DAG.getConstant(ShiftLeft, dl, MVT::i8)); return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec, DAG.getConstant(ShiftRight, dl, MVT::i8)) : WideSubVec; } if (IdxVal == 0) { // Zero lower bits of the Vec SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); // Merge them together return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); } // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { // Zero upper bits of the Vec WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, DAG.getConstant(IdxVal, dl, MVT::i8)); SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); } // Subvector should be inserted in the middle - use shuffle SmallVector Mask; for (unsigned i = 0; i < NumElems; ++i) Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ? i : i + NumElems); return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask); } /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 /// instructions. This is used because creating CONCAT_VECTOR nodes of /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower /// large BUILD_VECTORS. static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, SDLoc dl) { SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); return Insert128BitVector(V, V2, NumElems/2, DAG, dl); } static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, SDLoc dl) { SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } /// Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. /// Then bitcast to their original type, ensuring they get CSE'd. static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32); SDValue Vec; if (VT.is512BitVector()) { SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.is256BitVector()) { if (Subtarget->hasInt256()) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // AVX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); } } else if (VT.is128BitVector()) { Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else llvm_unreachable("Unexpected vector type"); return DAG.getBitcast(VT, Vec); } /// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector Mask; for (unsigned i = 0, e = NumElems/2; i != e; ++i) { Mask.push_back(i); Mask.push_back(i + NumElems); } return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } /// Returns a vector_shuffle node for an unpackh operation. static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector Mask; for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { Mask.push_back(i + Half); Mask.push_back(i + NumElems + Half); } return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } /// Return a vector_shuffle of the specified vector of zero or undef vector. /// This produces a shuffle where the low element of V2 is swizzled into the /// zero/undef vector, landing at element Idx. /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, bool IsZero, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = V2.getSimpleValueType(); SDValue V1 = IsZero ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); unsigned NumElems = VT.getVectorNumElements(); SmallVector MaskVec; for (unsigned i = 0; i != NumElems; ++i) // If this is the insertion idx, put the low elt of V2 here. MaskVec.push_back(i == Idx ? NumElems : i); return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); } /// Calculates the shuffle mask corresponding to the target-specific opcode. /// Returns true if the Mask could be calculated. Sets IsUnary to true if only /// uses one source. Note that this will set IsUnary for shuffles which use a /// single input multiple times, and in those cases it will /// adjust the mask to only have indices within that single input. static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, SmallVectorImpl &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; IsUnary = false; bool IsFakeUnary = false; switch(N->getOpcode()) { case X86ISD::BLENDI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeBLENDMask(VT, cast(ImmN)->getZExtValue(), Mask); break; case X86ISD::SHUFP: ImmN = N->getOperand(N->getNumOperands()-1); DecodeSHUFPMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::INSERTPS: ImmN = N->getOperand(N->getNumOperands()-1); DecodeINSERTPSMask(cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKL: DecodeUNPCKLMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVLHPS: DecodeMOVLHPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::PALIGNR: ImmN = N->getOperand(N->getNumOperands()-1); DecodePALIGNRMask(VT, cast(ImmN)->getZExtValue(), Mask); break; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFHW: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFHWMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFLW: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFLWMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFB: { IsUnary = true; SDValue MaskNode = N->getOperand(1); while (MaskNode->getOpcode() == ISD::BITCAST) MaskNode = MaskNode->getOperand(0); if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { // If we have a build-vector, then things are easy. MVT VT = MaskNode.getSimpleValueType(); assert(VT.isVector() && "Can't produce a non-vector with a build_vector!"); if (!VT.isInteger()) return false; int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8; SmallVector RawMask; for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) { SDValue Op = MaskNode->getOperand(i); if (Op->getOpcode() == ISD::UNDEF) { RawMask.push_back((uint64_t)SM_SentinelUndef); continue; } auto *CN = dyn_cast(Op.getNode()); if (!CN) return false; APInt MaskElement = CN->getAPIntValue(); // We now have to decode the element which could be any integer size and // extract each byte of it. for (int j = 0; j < NumBytesPerElement; ++j) { // Note that this is x86 and so always little endian: the low byte is // the first byte of the mask. RawMask.push_back(MaskElement.getLoBits(8).getZExtValue()); MaskElement = MaskElement.lshr(8); } } DecodePSHUFBMask(RawMask, Mask); break; } auto *MaskLoad = dyn_cast(MaskNode); if (!MaskLoad) return false; SDValue Ptr = MaskLoad->getBasePtr(); if (Ptr->getOpcode() == X86ISD::Wrapper || Ptr->getOpcode() == X86ISD::WrapperRIP) Ptr = Ptr->getOperand(0); auto *MaskCP = dyn_cast(Ptr); if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) return false; if (auto *C = dyn_cast(MaskCP->getConstVal())) { DecodePSHUFBMask(C, Mask); break; } return false; } case X86ISD::VPERMI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERMMask(cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::MOVSS: case X86ISD::MOVSD: DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask); break; case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVSLDUP: DecodeMOVSLDUPMask(VT, Mask); IsUnary = true; break; case X86ISD::MOVSHDUP: DecodeMOVSHDUPMask(VT, Mask); IsUnary = true; break; case X86ISD::MOVDDUP: DecodeMOVDDUPMask(VT, Mask); IsUnary = true; break; case X86ISD::MOVLHPD: case X86ISD::MOVLPD: case X86ISD::MOVLPS: // Not yet implemented return false; case X86ISD::VPERMV: { IsUnary = true; SDValue MaskNode = N->getOperand(0); while (MaskNode->getOpcode() == ISD::BITCAST) MaskNode = MaskNode->getOperand(0); unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()); SmallVector RawMask; if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { // If we have a build-vector, then things are easy. assert(MaskNode.getSimpleValueType().isInteger() && MaskNode.getSimpleValueType().getVectorNumElements() == VT.getVectorNumElements()); for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { SDValue Op = MaskNode->getOperand(i); if (Op->getOpcode() == ISD::UNDEF) RawMask.push_back((uint64_t)SM_SentinelUndef); else if (isa(Op)) { APInt MaskElement = cast(Op)->getAPIntValue(); RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); } else return false; } DecodeVPERMVMask(RawMask, Mask); break; } if (MaskNode->getOpcode() == X86ISD::VBROADCAST) { unsigned NumEltsInMask = MaskNode->getNumOperands(); MaskNode = MaskNode->getOperand(0); if (auto *CN = dyn_cast(MaskNode)) { APInt MaskEltValue = CN->getAPIntValue(); for (unsigned i = 0; i < NumEltsInMask; ++i) RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue()); DecodeVPERMVMask(RawMask, Mask); break; } // It may be a scalar load } auto *MaskLoad = dyn_cast(MaskNode); if (!MaskLoad) return false; SDValue Ptr = MaskLoad->getBasePtr(); if (Ptr->getOpcode() == X86ISD::Wrapper || Ptr->getOpcode() == X86ISD::WrapperRIP) Ptr = Ptr->getOperand(0); auto *MaskCP = dyn_cast(Ptr); if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) return false; if (auto *C = dyn_cast(MaskCP->getConstVal())) { DecodeVPERMVMask(C, VT, Mask); break; } return false; } case X86ISD::VPERMV3: { IsUnary = false; SDValue MaskNode = N->getOperand(1); while (MaskNode->getOpcode() == ISD::BITCAST) MaskNode = MaskNode->getOperand(1); if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { // If we have a build-vector, then things are easy. assert(MaskNode.getSimpleValueType().isInteger() && MaskNode.getSimpleValueType().getVectorNumElements() == VT.getVectorNumElements()); SmallVector RawMask; unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2); for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { SDValue Op = MaskNode->getOperand(i); if (Op->getOpcode() == ISD::UNDEF) RawMask.push_back((uint64_t)SM_SentinelUndef); else { auto *CN = dyn_cast(Op.getNode()); if (!CN) return false; APInt MaskElement = CN->getAPIntValue(); RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); } } DecodeVPERMV3Mask(RawMask, Mask); break; } auto *MaskLoad = dyn_cast(MaskNode); if (!MaskLoad) return false; SDValue Ptr = MaskLoad->getBasePtr(); if (Ptr->getOpcode() == X86ISD::Wrapper || Ptr->getOpcode() == X86ISD::WrapperRIP) Ptr = Ptr->getOperand(0); auto *MaskCP = dyn_cast(Ptr); if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) return false; if (auto *C = dyn_cast(MaskCP->getConstVal())) { DecodeVPERMV3Mask(C, VT, Mask); break; } return false; } default: llvm_unreachable("unknown target shuffle node"); } // Empty mask indicates the decode failed. if (Mask.empty()) return false; // Check if we're getting a shuffle mask with zero'd elements. if (!AllowSentinelZero) if (std::any_of(Mask.begin(), Mask.end(), [](int M){ return M == SM_SentinelZero; })) return false; // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. if (IsFakeUnary) for (int &M : Mask) if (M >= (int)Mask.size()) M -= Mask.size(); return true; } /// Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { if (Depth == 6) return SDValue(); // Limit search depth. SDValue V = SDValue(N, 0); EVT VT = V.getValueType(); unsigned Opcode = V.getOpcode(); // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. if (const ShuffleVectorSDNode *SV = dyn_cast(N)) { int Elt = SV->getMaskElt(Index); if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); unsigned NumElems = VT.getVectorNumElements(); SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { MVT ShufVT = V.getSimpleValueType(); int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector ShuffleMask; bool IsUnary; if (!getTargetShuffleMask(N, ShufVT, false, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; if (Elt == SM_SentinelUndef) return DAG.getUNDEF(ShufVT.getVectorElementType()); assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); SDValue NewV = (Elt < NumElems) ? N->getOperand(0) : N->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Actual nodes that may contain scalar elements if (Opcode == ISD::BITCAST) { V = V.getOperand(0); EVT SrcVT = V.getValueType(); unsigned NumElems = VT.getVectorNumElements(); if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) return SDValue(); } if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) return (Index == 0) ? V.getOperand(0) : DAG.getUNDEF(VT.getVectorElementType()); if (V.getOpcode() == ISD::BUILD_VECTOR) return V.getOperand(Index); return SDValue(); } /// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 8) return SDValue(); SDLoc dl(Op); SDValue V; bool First = true; // SSE4.1 - use PINSRB to insert each byte directly. if (Subtarget->hasSSE41()) { for (unsigned i = 0; i < 16; ++i) { bool isNonZero = (NonZeros & (1 << i)) != 0; if (isNonZero) { if (First) { if (NumZero) V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v16i8); First = false; } V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } } return V; } // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. for (unsigned i = 0; i < 16; ++i) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; if (ThisIsNonZero && First) { if (NumZero) V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; } if ((i & 1) != 0) { SDValue ThisElt, LastElt; bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; if (LastIsNonZero) { LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i-1)); } if (ThisIsNonZero) { ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt, DAG.getConstant(8, dl, MVT::i8)); if (LastIsNonZero) ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); } else ThisElt = LastElt; if (ThisElt.getNode()) V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, DAG.getIntPtrConstant(i/2, dl)); } } return DAG.getBitcast(MVT::v16i8, V); } /// Custom lower build_vector of v8i16. static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 4) return SDValue(); SDLoc dl(Op); SDValue V; bool First = true; for (unsigned i = 0; i < 8; ++i) { bool isNonZero = (NonZeros & (1 << i)) != 0; if (isNonZero) { if (First) { if (NumZero) V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; } V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } } return V; } /// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { // Find all zeroable elements. std::bitset<4> Zeroable; for (int i=0; i < 4; ++i) { SDValue Elt = Op->getOperand(i); Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)); } assert(Zeroable.size() - Zeroable.count() > 1 && "We expect at least two non-zero elements!"); // We only know how to deal with build_vector nodes where elements are either // zeroable or extract_vector_elt with constant index. SDValue FirstNonZero; unsigned FirstNonZeroIdx; for (unsigned i=0; i < 4; ++i) { if (Zeroable[i]) continue; SDValue Elt = Op->getOperand(i); if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Elt.getOperand(1))) return SDValue(); // Make sure that this node is extracting from a 128-bit vector. MVT VT = Elt.getOperand(0).getSimpleValueType(); if (!VT.is128BitVector()) return SDValue(); if (!FirstNonZero.getNode()) { FirstNonZero = Elt; FirstNonZeroIdx = i; } } assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); SDValue V1 = FirstNonZero.getOperand(0); MVT VT = V1.getSimpleValueType(); // See if this build_vector can be lowered as a blend with zero. SDValue Elt; unsigned EltMaskIdx, EltIdx; int Mask[4]; for (EltIdx = 0; EltIdx < 4; ++EltIdx) { if (Zeroable[EltIdx]) { // The zero vector will be on the right hand side. Mask[EltIdx] = EltIdx+4; continue; } Elt = Op->getOperand(EltIdx); // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. EltMaskIdx = cast(Elt.getOperand(1))->getZExtValue(); if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) break; Mask[EltIdx] = EltIdx; } if (EltIdx == 4) { // Let the shuffle legalizer deal with blend operations. SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); if (V1.getSimpleValueType() != VT) V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1); return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]); } // See if we can lower this build_vector to a INSERTPS. if (!Subtarget->hasSSE41()) return SDValue(); SDValue V2 = Elt.getOperand(0); if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) V1 = SDValue(); bool CanFold = true; for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { if (Zeroable[i]) continue; SDValue Current = Op->getOperand(i); SDValue SrcVector = Current->getOperand(0); if (!V1.getNode()) V1 = SrcVector; CanFold = SrcVector == V1 && cast(Current.getOperand(1))->getZExtValue() == i; } if (!CanFold) return SDValue(); assert(V1.getNode() && "Expected at least two non-zero elements!"); if (V1.getSimpleValueType() != MVT::v4f32) V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1); if (V2.getSimpleValueType() != MVT::v4f32) V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2); // Ok, we can emit an INSERTPS instruction. unsigned ZMask = Zeroable.to_ulong(); unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); SDLoc DL(Op); SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getIntPtrConstant(InsertPSMask, DL)); return DAG.getBitcast(VT, Result); } /// Return a vector logical shift node. static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, SDLoc dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); MVT ShVT = MVT::v2i64; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getBitcast(ShVT, SrcOp); MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT); assert(NumBits % 8 == 0 && "Only support byte sized shifts"); SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy); return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { // Check if the scalar load can be widened into a vector load. And if // the address is "base + cst" see if the cst can be "absorbed" into // the shuffle mask. if (LoadSDNode *LD = dyn_cast(SrcOp)) { SDValue Ptr = LD->getBasePtr(); if (!ISD::isNormalLoad(LD) || LD->isVolatile()) return SDValue(); EVT PVT = LD->getValueType(0); if (PVT != MVT::i32 && PVT != MVT::f32) return SDValue(); int FI = -1; int64_t Offset = 0; if (FrameIndexSDNode *FINode = dyn_cast(Ptr)) { FI = FINode->getIndex(); Offset = 0; } else if (DAG.isBaseWithConstantOffset(Ptr) && isa(Ptr.getOperand(0))) { FI = cast(Ptr.getOperand(0))->getIndex(); Offset = Ptr.getConstantOperandVal(1); Ptr = Ptr.getOperand(0); } else { return SDValue(); } // FIXME: 256-bit vector instructions don't require a strict alignment, // improve this code to support it better. unsigned RequiredAlign = VT.getSizeInBits()/8; SDValue Chain = LD->getChain(); // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { if (MFI->isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. // If someone *really* cares about this. That's the way to implement it. return SDValue(); } else { MFI->setObjectAlignment(FI, RequiredAlign); } } // (Offset % 16 or 32) must be multiple of 4. Then address is then // Ptr + (Offset & ~15). if (Offset < 0) return SDValue(); if ((Offset % RequiredAlign) & 3) return SDValue(); int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(StartOffset, DL, Ptr.getValueType())); } int EltNo = (Offset - StartOffset) >> 2; unsigned NumElems = VT.getVectorNumElements(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset), false, false, false, 0); SmallVector Mask(NumElems, EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); } return SDValue(); } /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the /// elements can be replaced by a single large load which has the same value as /// a build_vector or insert_subvector whose loaded operands are 'Elts'. /// /// Example: -> zextload a /// /// FIXME: we'd also like to handle the case where the last elements are zero /// rather than undef via VZEXT_LOAD, but we do not detect that case today. /// There's even a handy isZeroNode for that purpose. static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, SDLoc &DL, SelectionDAG &DAG, bool isAfterLegalize) { unsigned NumElems = Elts.size(); LoadSDNode *LDBase = nullptr; unsigned LastLoadedElt = -1U; // For each element in the initializer, see if we've found a load or an undef. // If we don't find an initial load element, or later load elements are // non-consecutive, bail out. for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Elts[i]; // Look through a bitcast. if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST) Elt = Elt.getOperand(0); if (!Elt.getNode() || (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) return SDValue(); if (!LDBase) { if (Elt.getNode()->getOpcode() == ISD::UNDEF) return SDValue(); LDBase = cast(Elt.getNode()); LastLoadedElt = i; continue; } if (Elt.getOpcode() == ISD::UNDEF) continue; LoadSDNode *LD = cast(Elt); EVT LdVT = Elt.getValueType(); // Each loaded element must be the correct fractional portion of the // requested vector load. if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems) return SDValue(); if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i)) return SDValue(); LastLoadedElt = i; } // If we have found an entire vector of loads and undefs, then return a large // load of the entire vector width starting at the base pointer. If we found // consecutive loads for the low half, generate a vzext_load node. if (LastLoadedElt == NumElems - 1) { assert(LDBase && "Did not find base load for merging consecutive loads"); EVT EltVT = LDBase->getValueType(0); // Ensure that the input vector size for the merged loads matches the // cumulative size of the input elements. if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) return SDValue(); if (isAfterLegalize && !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT)) return SDValue(); SDValue NewLd = SDValue(); NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->isVolatile(), LDBase->isNonTemporal(), LDBase->isInvariant(), LDBase->getAlignment()); if (LDBase->hasAnyUseOfValue(1)) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1), SDValue(NewLd.getNode(), 1)); DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), SDValue(NewLd.getNode(), 1)); } return NewLd; } //TODO: The code below fires only for for loading the low v2i32 / v2f32 //of a v4i32 / v4f32. It's probably worth generalizing. EVT EltVT = VT.getVectorElementType(); if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) && DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64, LDBase->getPointerInfo(), LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, false/*WriteMem*/); // Make sure the newly-created LOAD is in the same position as LDBase in // terms of dependency. We create a TokenFactor for LDBase and ResNode, and // update uses of LDBase's output chain to use the TokenFactor. if (LDBase->hasAnyUseOfValue(1)) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); } return DAG.getBitcast(VT, ResNode); } return SDValue(); } /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction /// to generate a splat value for the following cases: /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. /// 2. A splat shuffle which uses a scalar_to_vector node which comes from /// a scalar load, or a constant. /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, SelectionDAG &DAG) { // VBROADCAST requires AVX. // TODO: Splats could be generated for non-AVX CPUs using SSE // instructions, but there's less potential gain for only 128-bit vectors. if (!Subtarget->hasAVX()) return SDValue(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); SDValue Ld; bool ConstSplatVal; switch (Op.getOpcode()) { default: // Unknown pattern found. return SDValue(); case ISD::BUILD_VECTOR: { auto *BVOp = cast(Op.getNode()); BitVector UndefElements; SDValue Splat = BVOp->getSplatValue(&UndefElements); // We need a splat of a single value to use broadcast, and it doesn't // make any sense if the value is only in one element of the vector. if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) return SDValue(); Ld = Splat; ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); // Make sure that all of the users of a non-constant load are from the // BUILD_VECTOR node. if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); break; } case ISD::VECTOR_SHUFFLE: { ShuffleVectorSDNode *SVOp = cast(Op); // Shuffles must have a splat mask where the first element is // broadcasted. if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) return SDValue(); SDValue Sc = Op.getOperand(0); if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && Sc.getOpcode() != ISD::BUILD_VECTOR) { if (!Subtarget->hasInt256()) return SDValue(); // Use the register form of the broadcast instruction available on AVX2. if (VT.getSizeInBits() >= 256) Sc = Extract128BitVector(Sc, 0, DAG, dl); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); } Ld = Sc.getOperand(0); ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); // The scalar_to_vector node and the suspected // load node must have exactly one user. // Constants may have multiple users. // AVX-512 has register version of the broadcast bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && Ld.getValueType().getSizeInBits() >= 32; if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && !hasRegVer)) return SDValue(); break; } } unsigned ScalarSize = Ld.getValueType().getSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); // When optimizing for size, generate up to 5 extra bytes for a broadcast // instruction to save 8 or more bytes of constant pool data. // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. // On Sandybridge (no AVX2), it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. // But override that restriction when optimizing for size. // TODO: Check if splatting is recommended for other AVX-capable CPUs. if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. // For size optimization, also splat v2f64 and v2i64, and for size opt // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast(Ld)) C = CI->getConstantIntValue(); else if (ConstantFPSDNode *CF = dyn_cast(Ld)) C = CF->getConstantFPValue(); assert(C && "Invalid constant type"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } } bool IsLoad = ISD::isNormalLoad(Ld.getNode()); // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget->hasInt256() && (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The scalar source must be a normal load. if (!IsLoad) return SDValue(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (Subtarget->hasVLX() && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is no vbroadcastsd xmm if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) { if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } // Unsupported broadcast. return SDValue(); } /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real /// underlying vector and index. /// /// Modifies \p ExtractedFromVec to the real vector and returns the real /// index. static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx) { int Idx = cast(ExtIdx)->getZExtValue(); if (!isa(ExtractedFromVec)) return Idx; // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already // lowered this: // (extract_vector_elt (v8f32 %vreg1), Constant<6>) // to: // (extract_vector_elt (vector_shuffle<2,u,u,u> // (extract_subvector (v8f32 %vreg0), Constant<4>), // undef) // Constant<0>) // In this case the vector is the extract_subvector expression and the index // is 2, as specified by the shuffle. ShuffleVectorSDNode *SVOp = cast(ExtractedFromVec); SDValue ShuffleVec = SVOp->getOperand(0); MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); assert(ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType().getVectorElementType()); int ShuffleIdx = SVOp->getMaskElt(Idx); if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { ExtractedFromVec = ShuffleVec; return ShuffleIdx; } return Idx; } static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // Skip if insert_vec_elt is not supported. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) return SDValue(); SDLoc DL(Op); unsigned NumElems = Op.getNumOperands(); SDValue VecIn1; SDValue VecIn2; SmallVector InsertIndices; SmallVector Mask(NumElems, -1); for (unsigned i = 0; i != NumElems; ++i) { unsigned Opc = Op.getOperand(i).getOpcode(); if (Opc == ISD::UNDEF) continue; if (Opc != ISD::EXTRACT_VECTOR_ELT) { // Quit if more than 1 elements need inserting. if (InsertIndices.size() > 1) return SDValue(); InsertIndices.push_back(i); continue; } SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); // Quit if non-constant index. if (!isa(ExtIdx)) return SDValue(); int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); // Quit if extracted from vector of different type. if (ExtractedFromVec.getValueType() != VT) return SDValue(); if (!VecIn1.getNode()) VecIn1 = ExtractedFromVec; else if (VecIn1 != ExtractedFromVec) { if (!VecIn2.getNode()) VecIn2 = ExtractedFromVec; else if (VecIn2 != ExtractedFromVec) // Quit if more than 2 vectors to shuffle return SDValue(); } if (ExtractedFromVec == VecIn1) Mask[i] = Idx; else if (ExtractedFromVec == VecIn2) Mask[i] = Idx + NumElems; } if (!VecIn1.getNode()) return SDValue(); VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { unsigned Idx = InsertIndices[i]; NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), DAG.getIntPtrConstant(Idx, DL)); } return NV; } static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && Op.getScalarValueSizeInBits() == 1 && "Can not convert non-constant vector"); uint64_t Immediate = 0; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.getOpcode() != ISD::UNDEF) Immediate |= cast(In)->getZExtValue() << idx; } SDLoc dl(Op); MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8)); return DAG.getConstant(Immediate, dl, VT); } // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. SDValue X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); assert((VT.getVectorElementType() == MVT::i1) && "Unexpected type in LowerBUILD_VECTORvXi1!"); SDLoc dl(Op); if (ISD::isBuildVectorAllZeros(Op.getNode())) { SDValue Cst = DAG.getTargetConstant(0, dl, MVT::i1); SmallVector Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } if (ISD::isBuildVectorAllOnes(Op.getNode())) { SDValue Cst = DAG.getTargetConstant(1, dl, MVT::i1); SmallVector Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { SDValue Imm = ConvertI1VectorToInteger(Op, DAG); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, Imm); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } // Vector has one or more non-const elements uint64_t Immediate = 0; SmallVector NonConstIdx; bool IsSplat = true; bool HasConstElts = false; int SplatIdx = -1; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.getOpcode() == ISD::UNDEF) continue; if (!isa(In)) NonConstIdx.push_back(idx); else { Immediate |= cast(In)->getZExtValue() << idx; HasConstElts = true; } if (SplatIdx == -1) SplatIdx = idx; else if (In != Op.getOperand(SplatIdx)) IsSplat = false; } // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" if (IsSplat) return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx), DAG.getConstant(1, dl, VT), DAG.getConstant(0, dl, VT)); // insert elements one by one SDValue DstVec; SDValue Imm; if (Immediate) { MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8)); Imm = DAG.getConstant(Immediate, dl, ImmVT); } else if (HasConstElts) Imm = DAG.getConstant(0, dl, VT); else Imm = DAG.getUNDEF(VT); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) DstVec = DAG.getBitcast(VT, Imm); else { SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } for (unsigned i = 0; i < NonConstIdx.size(); ++i) { unsigned InsertIdx = NonConstIdx[i]; DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(InsertIdx), DAG.getIntPtrConstant(InsertIdx, dl)); } return DstVec; } /// \brief Return true if \p N implements a horizontal binop and return the /// operands for the horizontal binop into V0 and V1. /// /// This is a helper function of LowerToHorizontalOp(). /// This function checks that the build_vector \p N in input implements a /// horizontal operation. Parameter \p Opcode defines the kind of horizontal /// operation to match. /// For example, if \p Opcode is equal to ISD::ADD, then this function /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode /// is equal to ISD::SUB, then this function checks if this is a horizontal /// arithmetic sub. /// /// This function only analyzes elements of \p N whose indices are /// in range [BaseIdx, LastIdx). static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1) { EVT VT = N->getValueType(0); assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; unsigned NumElts = LastIdx - BaseIdx; V0 = DAG.getUNDEF(VT); V1 = DAG.getUNDEF(VT); // Check if N implements a horizontal binop. for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { SDValue Op = N->getOperand(i + BaseIdx); // Skip UNDEFs. if (Op->getOpcode() == ISD::UNDEF) { // Update the expected vector extract index. if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; ExpectedVExtractIdx += 2; continue; } CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); if (!CanFold) break; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op0.getOperand(0) == Op1.getOperand(0) && isa(Op0.getOperand(1)) && isa(Op1.getOperand(1))); if (!CanFold) break; unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); unsigned I1 = cast(Op1.getOperand(1))->getZExtValue(); if (i * 2 < NumElts) { if (V0.getOpcode() == ISD::UNDEF) { V0 = Op0.getOperand(0); if (V0.getValueType() != VT) return false; } } else { if (V1.getOpcode() == ISD::UNDEF) { V1 = Op0.getOperand(0); if (V1.getValueType() != VT) return false; } if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; } SDValue Expected = (i * 2 < NumElts) ? V0 : V1; if (I0 == ExpectedVExtractIdx) CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; else if (IsCommutable && I1 == ExpectedVExtractIdx) { // Try to match the following dag sequence: // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; } else CanFold = false; ExpectedVExtractIdx += 2; } return CanFold; } /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by /// a concat_vector. /// /// This is a helper function of LowerToHorizontalOp(). /// This function expects two 256-bit vectors called V0 and V1. /// At first, each vector is split into two separate 128-bit vectors. /// Then, the resulting 128-bit vectors are used to implement two /// horizontal binary operations. /// /// The kind of horizontal binary operation is defined by \p X86Opcode. /// /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to /// the two new horizontal binop. /// When Mode is set, the first horizontal binop dag node would take as input /// the lower 128-bit of V0 and the upper 128-bit of V0. The second /// horizontal binop dag node would take as input the lower 128-bit of V1 /// and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V0_HI /// HADD V1_LO, V1_HI /// /// Otherwise, the first horizontal binop dag node takes as input the lower /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V1_LO /// HADD V0_HI, V1_HI /// /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to /// the upper 128-bits of the result. static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, SDLoc DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI) { EVT VT = V0.getValueType(); assert(VT.is256BitVector() && VT == V1.getValueType() && "Invalid nodes in input!"); unsigned NumElts = VT.getVectorNumElements(); SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL); SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL); SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL); SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL); EVT NewVT = V0_LO.getValueType(); SDValue LO = DAG.getUNDEF(NewVT); SDValue HI = DAG.getUNDEF(NewVT); if (Mode) { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && V0->getOpcode() != ISD::UNDEF) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); if (!isUndefHI && V1->getOpcode() != ISD::UNDEF) HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); } else { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF || V1_LO->getOpcode() != ISD::UNDEF)) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF || V1_HI->getOpcode() != ISD::UNDEF)) HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); } /// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB /// node. static SDValue LowerToAddSub(const BuildVectorSDNode *BV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = BV->getSimpleValueType(0); if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) return SDValue(); SDLoc DL(BV); unsigned NumElts = VT.getVectorNumElements(); SDValue InVec0 = DAG.getUNDEF(VT); SDValue InVec1 = DAG.getUNDEF(VT); assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v2f64) && "build_vector with an invalid type found!"); // Odd-numbered elements in the input build vector are obtained from // adding two integer/float elements. // Even-numbered elements in the input build vector are obtained from // subtracting two integer/float elements. unsigned ExpectedOpcode = ISD::FSUB; unsigned NextExpectedOpcode = ISD::FADD; bool AddFound = false; bool SubFound = false; for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Op = BV->getOperand(i); // Skip 'undef' values. unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::UNDEF) { std::swap(ExpectedOpcode, NextExpectedOpcode); continue; } // Early exit if we found an unexpected opcode. if (Opcode != ExpectedOpcode) return SDValue(); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) // Early exit if we cannot match that sequence. if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Op0.getOperand(1)) || !isa(Op1.getOperand(1)) || Op0.getOperand(1) != Op1.getOperand(1)) return SDValue(); unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); if (I0 != i) return SDValue(); // We found a valid add/sub node. Update the information accordingly. if (i & 1) AddFound = true; else SubFound = true; // Update InVec0 and InVec1. if (InVec0.getOpcode() == ISD::UNDEF) { InVec0 = Op0.getOperand(0); if (InVec0.getSimpleValueType() != VT) return SDValue(); } if (InVec1.getOpcode() == ISD::UNDEF) { InVec1 = Op1.getOperand(0); if (InVec1.getSimpleValueType() != VT) return SDValue(); } // Make sure that operands in input to each add/sub node always // come from a same pair of vectors. if (InVec0 != Op0.getOperand(0)) { if (ExpectedOpcode == ISD::FSUB) return SDValue(); // FADD is commutable. Try to commute the operands // and then test again. std::swap(Op0, Op1); if (InVec0 != Op0.getOperand(0)) return SDValue(); } if (InVec1 != Op1.getOperand(0)) return SDValue(); // Update the pair of expected opcodes. std::swap(ExpectedOpcode, NextExpectedOpcode); } // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF && InVec1.getOpcode() != ISD::UNDEF) return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1); return SDValue(); } /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = BV->getSimpleValueType(0); unsigned NumElts = VT.getVectorNumElements(); unsigned NumUndefsLO = 0; unsigned NumUndefsHI = 0; unsigned Half = NumElts/2; // Count the number of UNDEF operands in the build_vector in input. for (unsigned i = 0, e = Half; i != e; ++i) if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) NumUndefsLO++; for (unsigned i = Half, e = NumElts; i != e; ++i) if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) NumUndefsHI++; // Early exit if this is either a build_vector of all UNDEFs or all the // operands but one are UNDEF. if (NumUndefsLO + NumUndefsHI + 1 >= NumElts) return SDValue(); SDLoc DL(BV); SDValue InVec0, InVec1; if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) { // Try to match an SSE3 float HADD/HSUB. if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { // Try to match an SSSE3 integer HADD/HSUB. if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); } if (!Subtarget->hasAVX()) return SDValue(); if ((VT == MVT::v8f32 || VT == MVT::v4f64)) { // Try to match an AVX horizontal add/sub of packed single/double // precision floating point values from 256-bit vectors. SDValue InVec2, InVec3; if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.getOpcode() == ISD::UNDEF || InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && ((InVec1.getOpcode() == ISD::UNDEF || InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.getOpcode() == ISD::UNDEF || InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && ((InVec1.getOpcode() == ISD::UNDEF || InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { // Try to match an AVX2 horizontal add/sub of signed integers. SDValue InVec2, InVec3; unsigned X86Opcode; bool CanFold = true; if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.getOpcode() == ISD::UNDEF || InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && ((InVec1.getOpcode() == ISD::UNDEF || InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.getOpcode() == ISD::UNDEF || InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && ((InVec1.getOpcode() == ISD::UNDEF || InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) X86Opcode = X86ISD::HSUB; else CanFold = false; if (CanFold) { // Fold this build_vector into a single horizontal add/sub. // Do this only if the target has AVX2. if (Subtarget->hasAVX2()) return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); // Do not try to expand this build_vector into a pair of horizontal // add/sub if we can emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into a pair of horizontal binop followed by // a concat vector. bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false, isUndefLO, isUndefHI); } } if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || VT == MVT::v16i16) && Subtarget->hasAVX()) { unsigned X86Opcode; if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HSUB; else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHADD; else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHSUB; else return SDValue(); // Don't try to expand this build_vector into a pair of horizontal add/sub // if we can simply emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into two horizontal add/sub followed by // a concat vector. bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, isUndefLO, isUndefHI); } return SDValue(); } SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT ExtVT = VT.getVectorElementType(); unsigned NumElems = Op.getNumOperands(); // Generate vectors for predicate vectors. if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512()) return LowerBUILD_VECTORvXi1(Op, DAG); // Vectors containing all zeros can be matched by pxor and xorps later if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) return Op; return getZeroVector(VT, Subtarget, DAG, dl); } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) return Op; if (!VT.is512BitVector()) return getOnesVector(VT, Subtarget, DAG, dl); } BuildVectorSDNode *BV = cast(Op.getNode()); if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG)) return AddSub; if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) return HorizontalOp; if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG)) return Broadcast; unsigned EVTBits = ExtVT.getSizeInBits(); unsigned NumZero = 0; unsigned NumNonZero = 0; uint64_t NonZeros = 0; bool IsAllConstants = true; SmallSet Values; for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (Elt.getOpcode() == ISD::UNDEF) continue; Values.insert(Elt); if (Elt.getOpcode() != ISD::Constant && Elt.getOpcode() != ISD::ConstantFP) IsAllConstants = false; if (X86::isZeroNode(Elt)) NumZero++; else { assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range. NonZeros |= ((uint64_t)1 << i); NumNonZero++; } } // All undef vector. Return an UNDEF. All zero vectors were handled above. if (NumNonZero == 0) return DAG.getUNDEF(VT); // Special case for single non-zero, non-undef, element. if (NumNonZero == 1) { unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); // If this is an insertion of an i64 value on x86-32, and if the top bits of // the value are obviously zero, truncate the value to i32 and do the // insertion that way. Only do this if the value is non-constant or if the // value is a constant being inserted into element 0. It is cheaper to do // a constant pool load than it is to do a movd + shuffle. if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && (!IsAllConstants || Idx == 0)) { if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { // Handle SSE only. assert(VT == MVT::v2i64 && "Expected an SSE value type!"); MVT VecVT = MVT::v4i32; // Truncate the value (which may itself be a constant) to i32, and // convert it to a vector with movd (S2V+shuffle to zero extend). Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef( Item, Idx * 2, true, Subtarget, DAG)); } } // If we have a constant or non-constant insertion into the low element of // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into // the rest of the elements. This will be matched as movd/movq/movss/movsd // depending on what the source datatype is. if (Idx == 0) { if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget->is64Bit())) { if (VT.is512BitVector()) { SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, Item, DAG.getIntPtrConstant(0, dl)); } assert((VT.is128BitVector() || VT.is256BitVector()) && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } // We can't directly insert an i8 or i16 into a vector, so zero extend // it to i32 first. if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); if (VT.is256BitVector()) { if (Subtarget->hasAVX()) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } else { // Without AVX, we need to extend to a 128-bit vector and then // insert into the 256-bit vector. Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); } } else { assert(VT.is128BitVector() && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } return DAG.getBitcast(VT, Item); } } // Is it a vector logical left shift? if (NumElems == 2 && Idx == 1 && X86::isZeroNode(Op.getOperand(0)) && !X86::isZeroNode(Op.getOperand(1))) { unsigned NumBits = VT.getSizeInBits(); return getVShift(true, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1)), NumBits/2, DAG, *this, dl); } if (IsAllConstants) // Otherwise, it's better to do a constpool load. return SDValue(); // Otherwise, if this is a vector with i32 or f32 elements, and the element // is a non-constant being inserted into an element other than the low one, // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka // movd/movss) to move this into the low element, then shuffle it into // place. if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); } } // Splat is obviously ok. Let legalizer expand it to a shuffle. if (Values.size() == 1) { if (EVTBits == 32) { // Instead of a shuffle like this: // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> // Check if it's possible to issue this instead. // shuffle (vload ptr)), undef, <1, 1, 1, 1> unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); if (Op.getNode()->isOnlyUserOf(Item.getNode())) return LowerAsSplatVectorLoad(Item, VT, dl, DAG); } return SDValue(); } // A vector full of immediates; various special cases are already // handled, so this is best done with a single constant-pool load. if (IsAllConstants) return SDValue(); // For AVX-length vectors, see if we can use a vector load to get all of the // elements, otherwise build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.is256BitVector() || VT.is512BitVector()) { SmallVector V(Op->op_begin(), Op->op_begin() + NumElems); // Check for a build vector of consecutive loads. if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) return LD; EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); // Build both the lower and upper subvector. SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, makeArrayRef(&V[0], NumElems/2)); SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, makeArrayRef(&V[NumElems / 2], NumElems/2)); // Recreate the wider vector with the lower and upper part. if (VT.is256BitVector()) return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl); } // Let legalizer expand 2-wide build_vectors. if (EVTBits == 64) { if (NumNonZero == 1) { // One half is zero or undef. unsigned Idx = countTrailingZeros(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); } // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget, *this)) return V; if (EVTBits == 16 && NumElems == 8) if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget, *this)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this)) return V; // If element VT is == 32 bits, turn it into a number of shuffles. SmallVector V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1ULL << i)); if (isZero) V[i] = getZeroVector(VT, Subtarget, DAG, dl); else V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } for (unsigned i = 0; i < 2; ++i) { switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { default: break; case 0: V[i] = V[i*2]; // Must be a zero vector. break; case 1: V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); break; case 2: V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); break; case 3: V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); break; } } bool Reverse1 = (NonZeros & 0x3) == 2; bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; int MaskVec[] = { Reverse1 ? 1 : 0, Reverse1 ? 0 : 1, static_cast(Reverse2 ? NumElems+1 : NumElems), static_cast(Reverse2 ? NumElems : NumElems+1) }; return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); } if (Values.size() > 1 && VT.is128BitVector()) { // Check for a build vector of consecutive loads. for (unsigned i = 0; i < NumElems; ++i) V[i] = Op.getOperand(i); // Check for elements which are consecutive loads. if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) return LD; // Check for a build vector from mostly shuffle plus few inserting. if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. if (Subtarget->hasSSE41()) { SDValue Result; if (Op.getOperand(0).getOpcode() != ISD::UNDEF) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); else Result = DAG.getUNDEF(VT); for (unsigned i = 1; i < NumElems; ++i) { if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } return Result; } // Otherwise, expand into a number of unpckl*, start by extending each of // our (non-undef) elements to the full vector width with the element in the // bottom slot of the vector (which generates no code for SSE). for (unsigned i = 0; i < NumElems; ++i) { if (Op.getOperand(i).getOpcode() != ISD::UNDEF) V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); else V[i] = DAG.getUNDEF(VT); } // Next, we iteratively mix elements, e.g. for v4f32: // Step 1: unpcklps 0, 2 ==> X: // : unpcklps 1, 3 ==> Y: // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> unsigned EltStride = NumElems >> 1; while (EltStride != 0) { for (unsigned i = 0; i < EltStride; ++i) { // If V[i+EltStride] is undef and this is the first round of mixing, // then it is safe to just drop this shuffle: V[i] is already in the // right place, the one element (since it's the first round) being // inserted as undef can be dropped. This isn't safe for successive // rounds because they will permute elements within both vectors. if (V[i+EltStride].getOpcode() == ISD::UNDEF && EltStride == NumElems/2) continue; V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); } EltStride >>= 1; } return V[0]; } return SDValue(); } // 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); assert((ResVT.is256BitVector() || ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); if (ResVT.is256BitVector()) return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); if (Op.getNumOperands() == 4) { MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); SDValue V3 = Op.getOperand(2); SDValue V4 = Op.getOperand(3); return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl), Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl); } return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG & DAG) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); unsigned NumOfOperands = Op.getNumOperands(); assert(isPowerOf2_32(NumOfOperands) && "Unexpected number of operands in CONCAT_VECTORS"); SDValue Undef = DAG.getUNDEF(ResVT); if (NumOfOperands > 2) { // Specialize the cases when all, or all but one, of the operands are undef. unsigned NumOfDefinedOps = 0; unsigned OpIdx = 0; for (unsigned i = 0; i < NumOfOperands; i++) if (!Op.getOperand(i).isUndef()) { NumOfDefinedOps++; OpIdx = i; } if (NumOfDefinedOps == 0) return Undef; if (NumOfDefinedOps == 1) { unsigned SubVecNumElts = Op.getOperand(OpIdx).getValueType().getVectorNumElements(); SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, Op.getOperand(OpIdx), IdxVal); } MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); SmallVector Ops; for (unsigned i = 0; i < NumOfOperands/2; i++) Ops.push_back(Op.getOperand(i)); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); Ops.clear(); for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++) Ops.push_back(Op.getOperand(i)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } // 2 operands SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); assert(V1.getValueType() == V2.getValueType() && V1.getValueType().getVectorNumElements() == NumElems/2 && "Unexpected operands in CONCAT_VECTORS"); if (ResVT.getSizeInBits() >= 16) return Op; // The operation is legal with KUNPCK bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl); if (IsZeroV1 && IsZeroV2) return ZeroVec; SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); if (V2.isUndef()) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); if (IsZeroV2) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx); SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl); if (V1.isUndef()) V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); if (IsZeroV1) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal); V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal); } static SDValue LowerCONCAT_VECTORS(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT.getVectorElementType() == MVT::i1) return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); assert((VT.is256BitVector() && Op.getNumOperands() == 2) || (VT.is512BitVector() && (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))); // AVX can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors return LowerAVXCONCAT_VECTORS(Op, DAG); } //===----------------------------------------------------------------------===// // Vector shuffle lowering // // This is an experimental code path for lowering vector shuffles on x86. It is // designed to handle arbitrary vector shuffles and blends, gracefully // degrading performance as necessary. It works hard to recognize idiomatic // shuffles and lower them to optimal instruction patterns without leaving // a framework that allows reasonably efficient handling of all vector shuffle // patterns. //===----------------------------------------------------------------------===// /// \brief Tiny helper function to identify a no-op mask. /// /// This is a somewhat boring predicate function. It checks whether the mask /// array input, which is assumed to be a single-input shuffle mask of the kind /// used by the X86 shuffle instructions (not a fully general /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an /// in-place shuffle are 'no-op's. static bool isNoopShuffleMask(ArrayRef Mask) { for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] != -1 && Mask[i] != i) return false; return true; } /// \brief Helper function to classify a mask as a single-input mask. /// /// This isn't a generic single-input test because in the vector shuffle /// lowering we canonicalize single inputs to be the first input operand. This /// means we can more quickly test for a single input by only checking whether /// an input from the second operand exists. We also assume that the size of /// mask corresponds to the size of the input vectors which isn't true in the /// fully general case. static bool isSingleInputShuffleMask(ArrayRef Mask) { for (int M : Mask) if (M >= (int)Mask.size()) return false; return true; } /// \brief Test whether there are elements crossing 128-bit lanes in this /// shuffle mask. /// /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations /// and we routinely test for these. static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { int LaneSize = 128 / VT.getScalarSizeInBits(); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) return true; return false; } /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane. /// /// This checks a shuffle mask to see if it is performing the same /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies /// that it is also not lane-crossing. It may however involve a blend from the /// same lane of a second vector. /// /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is /// non-trivial to compute in the face of undef lanes. The representation is /// *not* suitable for use with existing 128-bit shuffles as it will contain /// entries from both V1 and V2 inputs to the wider mask. static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { int LaneSize = 128 / VT.getScalarSizeInBits(); RepeatedMask.resize(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. if (RepeatedMask[i % LaneSize] == -1) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) // Found a mismatch with the repeated mask. return false; } return true; } /// \brief Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// /// This is a fast way to test a shuffle mask against a fixed pattern: /// /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... } /// /// It returns true if the mask is exactly as wide as the argument list, and /// each element of the mask is either -1 (signifying undef) or the value given /// in the argument. static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef Mask, ArrayRef ExpectedMask) { if (Mask.size() != ExpectedMask.size()) return false; int Size = Mask.size(); // If the values are build vectors, we can look through them to find // equivalent inputs that make the shuffles equivalent. auto *BV1 = dyn_cast(V1); auto *BV2 = dyn_cast(V2); for (int i = 0; i < Size; ++i) if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) { auto *MaskBV = Mask[i] < Size ? BV1 : BV2; auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; if (!MaskBV || !ExpectedBV || MaskBV->getOperand(Mask[i] % Size) != ExpectedBV->getOperand(ExpectedMask[i] % Size)) return false; } return true; } /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to /// the ubiquitous shuffle encoding scheme used in x86 instructions for /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for /// example. /// /// NB: We rely heavily on "undef" masks preserving the input lane. static SDValue getV4X86ShuffleImm8ForMask(ArrayRef Mask, SDLoc DL, SelectionDAG &DAG) { assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); unsigned Imm = 0; Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0; Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2; Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4; Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6; return DAG.getConstant(Imm, DL, MVT::i8); } /// \brief Compute whether each element of a shuffle is zeroable. /// /// A "zeroable" vector shuffle element is one which can be lowered to zero. /// Either it is an undef element in the shuffle mask, the element of the input /// referenced is undef, or the element of the input referenced is known to be /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle /// as many lanes with this technique as possible to simplify the remaining /// shuffle. static SmallBitVector computeZeroableShuffleElements(ArrayRef Mask, SDValue V1, SDValue V2) { SmallBitVector Zeroable(Mask.size(), false); while (V1.getOpcode() == ISD::BITCAST) V1 = V1->getOperand(0); while (V2.getOpcode() == ISD::BITCAST) V2 = V2->getOperand(0); bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; // Handle the easy cases. if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { Zeroable[i] = true; continue; } // If this is an index into a build_vector node (which has the same number // of elements), dig out the input value and use it. SDValue V = M < Size ? V1 : V2; if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) continue; SDValue Input = V.getOperand(M % Size); // The UNDEF opcode check really should be dead code here, but not quite // worth asserting on (it isn't invalid, just unexpected). if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) Zeroable[i] = true; } return Zeroable; } // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { int NumElts = VT.getVectorNumElements(); int NumEltsInLane = 128 / VT.getScalarSizeInBits(); SmallVector Unpckl; SmallVector Unpckh; for (int i = 0; i < NumElts; ++i) { unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2); int HiPos = LoPos + NumEltsInLane / 2; Unpckl.push_back(LoPos); Unpckh.push_back(HiPos); } if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); // Commute and try again. ShuffleVectorSDNode::commuteMask(Unpckl); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); ShuffleVectorSDNode::commuteMask(Unpckh); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); return SDValue(); } /// \brief Try to emit a bitmask instruction for a shuffle. /// /// This handles cases where we can model a blend exactly as a bitmask due to /// one of the inputs being zeroable. static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { MVT EltVT = VT.getVectorElementType(); int NumEltBits = EltVT.getSizeInBits(); MVT IntEltVT = MVT::getIntegerVT(NumEltBits); SDValue Zero = DAG.getConstant(0, DL, IntEltVT); SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, IntEltVT); if (EltVT.isFloatingPoint()) { Zero = DAG.getBitcast(EltVT, Zero); AllOnes = DAG.getBitcast(EltVT, AllOnes); } SmallVector VMaskOps(Mask.size(), Zero); SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Zeroable[i]) continue; if (Mask[i] % Size != i) return SDValue(); // Not a blend. if (!V) V = Mask[i] < Size ? V1 : V2; else if (V != (Mask[i] < Size ? V1 : V2)) return SDValue(); // Can only let one input through the mask. VMaskOps[i] = AllOnes; } if (!V) return SDValue(); // No non-zeroable elements! SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); V = DAG.getNode(VT.isFloatingPoint() ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, DL, VT, V, VMask); return V; } /// \brief Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are /// unavailable. Currently it is only suitable for integer vectors, but could /// be generalized for floating point vectors if desirable. static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); int NumEltBits = EltVT.getSizeInBits(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, EltVT); SmallVector MaskOps; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size) return SDValue(); // Shuffled input! MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); } SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps); V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); // We have to cast V2 around. MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT, DAG.getBitcast(MaskVT, V1Mask), DAG.getBitcast(MaskVT, V2))); return DAG.getNode(ISD::OR, DL, VT, V1, V2); } /// \brief Try to emit a blend instruction for a shuffle. /// /// This doesn't do any checks for the availability of instructions for blending /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is /// that the shuffle mask is a blend, or convertible into a blend with zero. static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Original, const X86Subtarget *Subtarget, SelectionDAG &DAG) { bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); SmallVector Mask(Original.begin(), Original.end()); SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); bool ForceV1Zero = false, ForceV2Zero = false; // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. unsigned BlendMask = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; if (M < 0) continue; if (M == i) continue; if (M == i + Size) { BlendMask |= 1u << i; continue; } if (Zeroable[i]) { if (V1IsZero) { ForceV1Zero = true; Mask[i] = i; continue; } if (V2IsZero) { ForceV2Zero = true; BlendMask |= 1u << i; Mask[i] = i + Size; continue; } } return SDValue(); // Shuffled input! } // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. if (ForceV1Zero) V1 = getZeroVector(VT, Subtarget, DAG, DL); if (ForceV2Zero) V2 = getZeroVector(VT, Subtarget, DAG, DL); auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) { unsigned ScaledMask = 0; for (int i = 0; i != Size; ++i) if (BlendMask & (1u << i)) for (int j = 0; j != Scale; ++j) ScaledMask |= 1u << (i * Scale + j); return ScaledMask; }; switch (VT.SimpleTy) { case MVT::v2f64: case MVT::v4f32: case MVT::v4f64: case MVT::v8f32: return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); case MVT::v4i64: case MVT::v8i32: assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); // FALLTHROUGH case MVT::v2i64: case MVT::v4i32: // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into // that instruction. if (Subtarget->hasAVX2()) { // Scale the blend by the number of 32-bit dwords per element. int Scale = VT.getScalarSizeInBits() / 32; BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8))); } // FALLTHROUGH case MVT::v8i16: { // For integer shuffles we need to expand the mask and cast the inputs to // v8i16s prior to blending. int Scale = 8 / VT.getVectorNumElements(); BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = DAG.getBitcast(MVT::v8i16, V2); return DAG.getBitcast(VT, DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8))); } case MVT::v16i16: { assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // We can lower these with PBLENDW which is mirrored across 128-bit lanes. assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); BlendMask = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 16) BlendMask |= 1u << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); } } // FALLTHROUGH case MVT::v16i8: case MVT::v32i8: { assert((VT.is128BitVector() || Subtarget->hasAVX2()) && "256-bit byte-blends require AVX2 support!"); // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) return Masked; // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; // This form of blend is always done on bytes. Compute the byte vector // type. MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); // Compute the VSELECT mask. Note that VSELECT is really confusing in the // mix of LLVM's code generator and the x86 backend. We tell the code // generator that boolean values in the elements of an x86 vector register // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' // mapping a select to operand #1, and 'false' mapping to operand #2. The // reality in x86 is that vector masks (pre-AVX-512) use only the high bit // of the element (the remaining are ignored) and 0 in that high bit would // mean operand #1 while 1 in the high bit would mean operand #2. So while // the LLVM model for boolean values in vector elements gets the relevant // bit set, it is set backwards and over constrained relative to x86's // actual model. SmallVector VSELECTMask; for (int i = 0, Size = Mask.size(); i < Size; ++i) for (int j = 0; j < Scale; ++j) VSELECTMask.push_back( Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8)); V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask), V1, V2)); } default: llvm_unreachable("Not a supported integer vector type!"); } } /// \brief Try to lower as a blend of elements from two inputs followed by /// a single-input permutation. /// /// This matches the pattern where we can blend elements from two inputs and /// then reduce the shuffle to a single-input permutation. static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { // We build up the blend mask while checking whether a blend is a viable way // to reduce the shuffle. SmallVector BlendMask(Mask.size(), -1); SmallVector PermuteMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds."); if (BlendMask[Mask[i] % Size] == -1) BlendMask[Mask[i] % Size] = Mask[i]; else if (BlendMask[Mask[i] % Size] != Mask[i]) return SDValue(); // Can't blend in the needed input! PermuteMask[i] = Mask[i] % Size; } SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); } /// \brief Generic routine to decompose a shuffle and blend into indepndent /// blends and permutes. /// /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and /// blends. static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { // Shuffle the input elements into the desired positions in V1 and V2 and // blend them together. SmallVector V1Mask(Mask.size(), -1); SmallVector V2Mask(Mask.size(), -1); SmallVector BlendMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= 0 && Mask[i] < Size) { V1Mask[i] = Mask[i]; BlendMask[i] = i; } else if (Mask[i] >= Size) { V2Mask[i] = Mask[i] - Size; BlendMask[i] = i + Size; } // Try to lower with the simpler initial blend strategy unless one of the // input shuffles would be a no-op. We prefer to shuffle inputs as the // shuffle may be able to fold with a load or other benefit. However, when // we'll have to do 2x as many shuffles in order to achieve this, blending // first is a better strategy. if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) return BlendPerm; V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); } /// \brief Try to lower a vector shuffle as a byte rotation. /// /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will /// try to generically lower a vector shuffle through such an pattern. It /// does not check for the profitability of lowering either as PALIGNR or /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. /// This matches shuffle vectors that look like: /// /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; int NumLaneElts = NumElts / NumLanes; // We need to detect various ways of spelling a rotation: // [11, 12, 13, 14, 15, 0, 1, 2] // [-1, 12, 13, 14, -1, -1, 1, -1] // [-1, -1, -1, -1, -1, -1, 1, 2] // [ 3, 4, 5, 6, 7, 8, 9, 10] // [-1, 4, 5, 6, -1, -1, 9, -1] // [-1, 4, 5, 6, -1, -1, -1, -1] int Rotation = 0; SDValue Lo, Hi; for (int l = 0; l < NumElts; l += NumLaneElts) { for (int i = 0; i < NumLaneElts; ++i) { if (Mask[l + i] == -1) continue; assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!"); // Get the mod-Size index and lane correct it. int LaneIdx = (Mask[l + i] % NumElts) - l; // Make sure it was in this lane. if (LaneIdx < 0 || LaneIdx >= NumLaneElts) return SDValue(); // Determine where a rotated vector would have started. int StartIdx = i - LaneIdx; if (StartIdx == 0) // The identity rotation isn't interesting, stop. return SDValue(); // If we found the tail of a vector the rotation must be the missing // front. If we found the head of a vector, it must be how much of the // head. int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx; if (Rotation == 0) Rotation = CandidateRotation; else if (Rotation != CandidateRotation) // The rotations don't match, so we can't match this mask. return SDValue(); // Compute which value this mask is pointing at. SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2; // Compute which of the two target values this index should be assigned // to. This reflects whether the high elements are remaining or the low // elements are remaining. SDValue &TargetV = StartIdx < 0 ? Hi : Lo; // Either set up this value if we've not encountered it before, or check // that it remains consistent. if (!TargetV) TargetV = MaskV; else if (TargetV != MaskV) // This may be a rotation, but it pulls from the inputs in some // unsupported interleaving. return SDValue(); } } // Check that we successfully analyzed the mask, and normalize the results. assert(Rotation != 0 && "Failed to locate a viable rotation!"); assert((Lo || Hi) && "Failed to find a rotated input vector!"); if (!Lo) Lo = Hi; else if (!Hi) Hi = Lo; // The actual rotate instruction rotates bytes, so we need to scale the // rotation based on how many bytes are in the vector lane. int Scale = 16 / NumLaneElts; // SSSE3 targets can use the palignr instruction. if (Subtarget->hasSSSE3()) { // Cast the inputs to i8 vector of correct length to match PALIGNR. MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes); Lo = DAG.getBitcast(AlignVT, Lo); Hi = DAG.getBitcast(AlignVT, Hi); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi, DAG.getConstant(Rotation * Scale, DL, MVT::i8))); } assert(VT.is128BitVector() && "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); // Default SSE2 implementation int LoByteShift = 16 - Rotation * Scale; int HiByteShift = Rotation * Scale; // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ. Lo = DAG.getBitcast(MVT::v2i64, Lo); Hi = DAG.getBitcast(MVT::v2i64, Hi); SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo, DAG.getConstant(LoByteShift, DL, MVT::i8)); SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi, DAG.getConstant(HiByteShift, DL, MVT::i8)); return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); } /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function /// matches elements from one of the input vectors shuffled to the left or /// right with zeroable elements 'shifted in'. It handles both the strictly /// bit-wise element shifts and the byte shift across an entire 128-bit double /// quad word lane. /// /// PSHL : (little-endian) left bit shift. /// [ zz, 0, zz, 2 ] /// [ -1, 4, zz, -1 ] /// PSRL : (little-endian) right bit shift. /// [ 1, zz, 3, zz] /// [ -1, -1, 7, zz] /// PSLLDQ : (little-endian) left byte shift /// [ zz, 0, 1, 2, 3, 4, 5, 6] /// [ zz, zz, -1, -1, 2, 3, 4, -1] /// [ zz, zz, zz, zz, zz, zz, -1, 1] /// PSRLDQ : (little-endian) right byte shift /// [ 5, 6, 7, zz, zz, zz, zz, zz] /// [ -1, 5, 6, 7, zz, zz, zz, zz] /// [ 1, 2, -1, -1, -1, -1, zz, zz] static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); auto CheckZeros = [&](int Shift, int Scale, bool Left) { for (int i = 0; i < Size; i += Scale) for (int j = 0; j < Shift; ++j) if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) return false; return true; }; auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) { for (int i = 0; i != Size; i += Scale) { unsigned Pos = Left ? i + Shift : i; unsigned Low = Left ? i : i + Shift; unsigned Len = Scale - Shift; if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + (V == V1 ? 0 : Size))) return SDValue(); } int ShiftEltBits = VT.getScalarSizeInBits() * Scale; bool ByteShift = ShiftEltBits > 64; unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1); // Normalize the scale for byte shifts to still produce an i64 element // type. Scale = ByteShift ? Scale / 2 : Scale; // We need to round trip through the appropriate type for the shift. MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale); MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale); assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, DL, MVT::i8)); return DAG.getBitcast(VT, V); }; // SSE/AVX supports logical shifts up to 64-bit integers - so we can just // keep doubling the size of the integer elements up to that. We can // then shift the elements of the integer vector by whole multiples of // their width within the elements of the larger integer vector. Test each // multiple to see if we can find a match with the moved element indices // and that the shifted in elements are all zeroable. for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2) for (int Shift = 1; Shift != Scale; ++Shift) for (bool Left : {true, false}) if (CheckZeros(Shift, Scale, Left)) for (SDValue V : {V1, V2}) if (SDValue Match = MatchShift(Shift, Scale, Left, V)) return Match; // no match return SDValue(); } /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); assert(!Zeroable.all() && "Fully zeroable shuffle mask"); int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) return SDValue(); // EXTRQ: Extract Len elements from lower half of source, starting at Idx. // Remainder of lower half result is zero and upper half is all undef. auto LowerAsEXTRQ = [&]() { // Determine the extraction length from the part of the // lower half that isn't zeroable. int Len = HalfSize; for (; Len > 0; --Len) if (!Zeroable[Len - 1]) break; assert(Len > 0 && "Zeroable shuffle mask"); // Attempt to match first Len sequential elements from the lower half. SDValue Src; int Idx = -1; for (int i = 0; i != Len; ++i) { int M = Mask[i]; if (M < 0) continue; SDValue &V = (M < Size ? V1 : V2); M = M % Size; // The extracted elements must start at a valid index and all mask // elements must be in the lower half. if (i > M || M >= HalfSize) return SDValue(); if (Idx < 0 || (Src == V && Idx == (M - i))) { Src = V; Idx = M - i; continue; } return SDValue(); } if (Idx < 0) return SDValue(); assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); }; if (SDValue ExtrQ = LowerAsEXTRQ()) return ExtrQ; // INSERTQ: Extract lowest Len elements from lower half of second source and // insert over first source, starting at Idx. // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } auto LowerAsInsertQ = [&]() { for (int Idx = 0; Idx != HalfSize; ++Idx) { SDValue Base; // Attempt to match first source from mask before insertion point. if (isUndefInRange(Mask, 0, Idx)) { /* EMPTY */ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { Base = V1; } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { Base = V2; } else { continue; } // Extend the extraction length looking to match both the insertion of // the second source and the remaining elements of the first. for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { SDValue Insert; int Len = Hi - Idx; // Match insertion. if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { Insert = V1; } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { Insert = V2; } else { continue; } // Match the remaining elements of the lower half. if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { /* EMPTY */ } else if ((!Base || (Base == V1)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { Base = V1; } else if ((!Base || (Base == V2)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Size + Hi)) { Base = V2; } else { continue; } // We may not have a base (first source) - this can safely be undefined. if (!Base) Base = DAG.getUNDEF(VT); int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); } } return SDValue(); }; if (SDValue InsertQ = LowerAsInsertQ()) return InsertQ; return SDValue(); } /// \brief Lower a vector shuffle as a zero or any extension. /// /// Given a specific number of elements, element bit width, and extension /// stride, produce either a zero or any extension based on the available /// features of the subtarget. The extended elements are consecutive and /// begin and can start from an offseted element index in the input; to /// avoid excess shuffling the offset must either being in the bottom lane /// or at the start of a higher lane. All extended elements must be from /// the same lane. static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); int EltBits = VT.getScalarSizeInBits(); int NumElements = VT.getVectorNumElements(); int NumEltsPerLane = 128 / EltBits; int OffsetLane = Offset / NumEltsPerLane; assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Only 8, 16, and 32 bit elements can be extended."); assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); assert(0 <= Offset && "Extension offset must be positive."); assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && "Extension offset must be in the first lane or start an upper lane."); // Check that an index is in same lane as the base offset. auto SafeOffset = [&](int Idx) { return OffsetLane == (Idx / NumEltsPerLane); }; // Shift along an input so that the offset base moves to the first element. auto ShuffleOffset = [&](SDValue V) { if (!Offset) return V; SmallVector ShMask((unsigned)NumElements, -1); for (int i = 0; i * Scale < NumElements; ++i) { int SrcIdx = i + Offset; ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1; } return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); }; // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. if (Subtarget->hasSSE41()) { // Not worth offseting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. if (Offset && Scale == 2 && VT.is128BitVector()) return SDValue(); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV)); return DAG.getBitcast(VT, InputV); } assert(VT.is128BitVector() && "Only 128-bit vectors can be extended."); // For any extends we can cheat for larger element sizes and use shuffle // instructions that can fold with a load and/or copy. if (AnyExt && EltBits == 32) { int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1, -1}; return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } if (AnyExt && EltBits == 16 && Scale > 2) { int PSHUFDMask[4] = {Offset / 2, -1, SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); int PSHUFWMask[4] = {1, -1, -1, -1}; unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); return DAG.getBitcast( VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, InputV), getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); } // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes // to 64-bits. if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) { assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); assert(VT.is128BitVector() && "Unexpected vector width!"); int LoIdx = Offset * EltBits; SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(LoIdx, DL, MVT::i8))); if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || !SafeOffset(Offset + 1)) return DAG.getNode(ISD::BITCAST, DL, VT, Lo); int HiIdx = (Offset + 1) * EltBits; SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(HiIdx, DL, MVT::i8))); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } // If this would require more than 2 unpack instructions to expand, use // pshufb when available. We can only use more than 2 unpack instructions // when zero extending i8 elements which also makes it easier to use pshufb. if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) { assert(NumElements == 16 && "Unexpected byte vector width!"); SDValue PSHUFBMask[16]; for (int i = 0; i < 16; ++i) { int Idx = Offset + (i / Scale); PSHUFBMask[i] = DAG.getConstant( (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); } InputV = DAG.getBitcast(MVT::v16i8, InputV); return DAG.getBitcast(VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask))); } // If we are extending from an offset, ensure we start on a boundary that // we can unpack from. int AlignToUnpack = Offset % (NumElements / Scale); if (AlignToUnpack) { SmallVector ShMask((unsigned)NumElements, -1); for (int i = AlignToUnpack; i < NumElements; ++i) ShMask[i - AlignToUnpack] = i; InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); Offset -= AlignToUnpack; } // Otherwise emit a sequence of unpacks. do { unsigned UnpackLoHi = X86ISD::UNPCKL; if (Offset >= (NumElements / 2)) { UnpackLoHi = X86ISD::UNPCKH; Offset -= (NumElements / 2); } MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) : getZeroVector(InputVT, Subtarget, DAG, DL); InputV = DAG.getBitcast(InputVT, InputV); InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); Scale /= 2; EltBits *= 2; NumElements /= 2; } while (Scale > 1); return DAG.getBitcast(VT, InputV); } /// \brief Try to lower a vector shuffle as a zero extension on any microarch. /// /// This routine will try to do everything in its power to cleverly lower /// a shuffle which happens to match the pattern of a zero extend. It doesn't /// check for the profitability of this lowering, it tries to aggressively /// match this pattern. It will use all of the micro-architectural details it /// can to emit an efficient lowering. It handles both blends with all-zero /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to /// masking out later). /// /// The reason we have dedicated lowering for zext-style shuffles is that they /// are both incredibly common and often quite performance sensitive. static SDValue lowerVectorShuffleAsZeroOrAnyExtend( SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Bits = VT.getSizeInBits(); int NumLanes = Bits / 128; int NumElements = VT.getVectorNumElements(); int NumEltsPerLane = NumElements / NumLanes; assert(VT.getScalarSizeInBits() <= 32 && "Exceeds 32-bit integer zero extension limit"); assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); // Define a helper function to check a particular ext-scale and lower to it if // valid. auto Lower = [&](int Scale) -> SDValue { SDValue InputV; bool AnyExt = true; int Offset = 0; int Matches = 0; for (int i = 0; i < NumElements; ++i) { int M = Mask[i]; if (M == -1) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { // Each of the extended elements need to be zeroable. if (!Zeroable[i]) return SDValue(); // We no longer are in the anyext case. AnyExt = false; continue; } // Each of the base elements needs to be consecutive indices into the // same input vector. SDValue V = M < NumElements ? V1 : V2; M = M % NumElements; if (!InputV) { InputV = V; Offset = M - (i / Scale); } else if (InputV != V) return SDValue(); // Flip-flopping inputs. // Offset must start in the lowest 128-bit lane or at the start of an // upper lane. // FIXME: Is it ever worth allowing a negative base offset? if (!((0 <= Offset && Offset < NumEltsPerLane) || (Offset % NumEltsPerLane) == 0)) return SDValue(); // If we are offsetting, all referenced entries must come from the same // lane. if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) return SDValue(); if ((M % NumElements) != (Offset + (i / Scale))) return SDValue(); // Non-consecutive strided elements. Matches++; } // If we fail to find an input, we have a zero-shuffle which should always // have already been handled. // FIXME: Maybe handle this here in case during blending we end up with one? if (!InputV) return SDValue(); // If we are offsetting, don't extend if we only match a single input, we // can always do better by using a basic PSHUF or PUNPCK. if (Offset != 0 && Matches < 2) return SDValue(); return lowerVectorShuffleAsSpecificZeroOrAnyExtend( DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. assert(Bits % 64 == 0 && "The number of bits in a vector must be divisible by 64 on x86!"); int NumExtElements = Bits / 64; // Each iteration, try extending the elements half as much, but into twice as // many elements. for (; NumExtElements < NumElements; NumExtElements *= 2) { assert(NumElements % NumExtElements == 0 && "The input vector size must be divisible by the extended size."); if (SDValue V = Lower(NumElements / NumExtElements)) return V; } // General extends failed, but 128-bit vectors may be able to use MOVQ. if (Bits != 128) return SDValue(); // Returns one of the source operands if the shuffle can be reduced to a // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. auto CanZExtLowHalf = [&]() { for (int i = NumElements / 2; i != NumElements; ++i) if (!Zeroable[i]) return SDValue(); if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) return V1; if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) return V2; return SDValue(); }; if (SDValue V = CanZExtLowHalf()) { V = DAG.getBitcast(MVT::v2i64, V); V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); return DAG.getBitcast(VT, V); } // No viable ext lowering found. return SDValue(); } /// \brief Try to get a scalar value for a specific element of a vector. /// /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG) { MVT VT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); while (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); // If the bitcasts shift the element size, we can't extract an equivalent // element from it. MVT NewVT = V.getSimpleValueType(); if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); if (V.getOpcode() == ISD::BUILD_VECTOR || (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { // Ensure the scalar operand is the same size as the destination. // FIXME: Add support for scalar truncation where possible. SDValue S = V.getOperand(Idx); if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, S); } return SDValue(); } /// \brief Helper to test for a load that can be folded with x86 shuffles. /// /// This is particularly important because the set of instructions varies /// significantly based on whether the operand is a load or not. static bool isShuffleFoldableLoad(SDValue V) { while (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); return ISD::isNON_EXTLoad(V.getNode()); } /// \brief Try to lower insertion of a single element into a zero vector. /// /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. static SDValue lowerVectorShuffleAsElementInsertion( SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); int V2Index = std::find_if(Mask.begin(), Mask.end(), [&Mask](int M) { return M >= (int)Mask.size(); }) - Mask.begin(); bool IsV1Zeroable = true; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (i != V2Index && !Zeroable[i]) { IsV1Zeroable = false; break; } // Check for a single input from a SCALAR_TO_VECTOR node. // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), DAG); if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); if (EltVT == MVT::i8 || EltVT == MVT::i16) { // Using zext to expand a narrow element won't work for non-zero // insertions. if (!IsV1Zeroable) return SDValue(); // Zero-extend directly to i32. ExtVT = MVT::v4i32; V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); } V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || EltVT == MVT::i16) { // Either not inserting from the low element of the input or the input // element size is too small to use VZEXT_MOVL to clear the high bits. return SDValue(); } if (!IsV1Zeroable) { // If V1 can't be treated as a zero vector we have fewer options to lower // this. We can't support integer vectors or non-zero targets cheaply, and // the V1 elements can't be permuted in any way. assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); if (!VT.isFloatingPoint() || V2Index != 0) return SDValue(); SmallVector V1Mask(Mask.begin(), Mask.end()); V1Mask[V2Index] = -1; if (!isNoopShuffleMask(V1Mask)) return SDValue(); // This is essentially a special case blend operation, but if we have // general purpose blend operations, they are always faster. Bail and let // the rest of the lowering handle these as blends. if (Subtarget->hasSSE41()) return SDValue(); // Otherwise, use MOVSD or MOVSS. assert((EltVT == MVT::f32 || EltVT == MVT::f64) && "Only two types of floating point element types to handle!"); return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, ExtVT, V1, V2); } // This lowering only works for the low element with floating point vectors. if (VT.isFloatingPoint() && V2Index != 0) return SDValue(); V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); if (ExtVT != VT) V2 = DAG.getBitcast(VT, V2); if (V2Index != 0) { // If we have 4 or fewer lanes we can cheaply shuffle the element into // the desired position. Otherwise it is more efficient to do a vector // shift left. We know that we can do a vector shift left because all // the inputs are zero. if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { SmallVector V2Shuffle(Mask.size(), 1); V2Shuffle[V2Index] = 0; V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { V2 = DAG.getBitcast(MVT::v2i64, V2); V2 = DAG.getNode( X86ISD::VSHLDQ, DL, MVT::v2i64, V2, DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, DAG.getTargetLoweringInfo().getScalarShiftAmountTy( DAG.getDataLayout(), VT))); V2 = DAG.getBitcast(VT, V2); } } return V2; } /// \brief Try to lower broadcast of a single - truncated - integer element, /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. /// /// This assumes we have AVX2. static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->hasAVX2() && "We can only lower integer broadcasts with AVX2!"); EVT EltVT = VT.getVectorElementType(); EVT V0VT = V0.getValueType(); assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); EVT V0EltVT = V0VT.getVectorElementType(); if (!V0EltVT.isInteger()) return SDValue(); const unsigned EltSize = EltVT.getSizeInBits(); const unsigned V0EltSize = V0EltVT.getSizeInBits(); // This is only a truncation if the original element type is larger. if (V0EltSize <= EltSize) return SDValue(); assert(((V0EltSize % EltSize) == 0) && "Scalar type sizes must all be powers of 2 on x86!"); const unsigned V0Opc = V0.getOpcode(); const unsigned Scale = V0EltSize / EltSize; const unsigned V0BroadcastIdx = BroadcastIdx / Scale; if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && V0Opc != ISD::BUILD_VECTOR) return SDValue(); SDValue Scalar = V0.getOperand(V0BroadcastIdx); // If we're extracting non-least-significant bits, shift so we can truncate. // Hopefully, we can fold away the trunc/srl/load into the broadcast. // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. if (const int OffsetIdx = BroadcastIdx % Scale) Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType())); return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); } /// \brief Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them? static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { if (!Subtarget->hasAVX()) return SDValue(); if (VT.isInteger() && !Subtarget->hasAVX2()) return SDValue(); // Check that the mask is a broadcast. int BroadcastIdx = -1; for (int M : Mask) if (M >= 0 && BroadcastIdx == -1) BroadcastIdx = M; else if (M >= 0 && M != BroadcastIdx) return SDValue(); assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " "a sorted mask where the broadcast " "comes from V1."); // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. for (;;) { switch (V.getOpcode()) { case ISD::CONCAT_VECTORS: { int OperandSize = Mask.size() / V.getNumOperands(); V = V.getOperand(BroadcastIdx / OperandSize); BroadcastIdx %= OperandSize; continue; } case ISD::INSERT_SUBVECTOR: { SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); auto ConstantIdx = dyn_cast(V.getOperand(2)); if (!ConstantIdx) break; int BeginIdx = (int)ConstantIdx->getZExtValue(); int EndIdx = BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { BroadcastIdx -= BeginIdx; V = VInner; } else { V = VOuter; } continue; } } break; } // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. // First, look through bitcast: if the original value has a larger element // type than the shuffle, the broadcast element is in essence truncated. // Make that explicit to ease folding. if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; MVT BroadcastVT = VT; // Peek through any bitcast (only useful for loads). SDValue BC = V; while (BC.getOpcode() == ISD::BITCAST) BC = BC.getOperand(0); // Also check the simpler case, where we can directly reuse the scalar. if (V.getOpcode() == ISD::BUILD_VECTOR || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); // If the scalar isn't a load, we can't broadcast from it in AVX1. // Only AVX2 has register broadcasts. if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) return SDValue(); } else if (MayFoldLoad(BC) && !cast(BC)->isVolatile()) { // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget->is64Bit() && VT.getScalarType() == MVT::i64) BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); // If we are broadcasting a load that is only used by the shuffle // then we can reduce the vector load to the broadcasted scalar load. LoadSDNode *Ld = cast(BC); SDValue BaseAddr = Ld->getOperand(1); EVT AddrVT = BaseAddr.getValueType(); EVT SVT = BroadcastVT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); SDValue NewAddr = DAG.getNode( ISD::ADD, DL, AddrVT, BaseAddr, DAG.getConstant(Offset, DL, AddrVT)); V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( Ld->getMemOperand(), Offset, SVT.getStoreSize())); } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { // We can't broadcast from a vector register without AVX2, and we can only // broadcast from the zero-element of a vector register. return SDValue(); } V = DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, V); return DAG.getBitcast(VT, V); } // Check for whether we can use INSERTPS to perform the shuffle. We only use // INSERTPS when the V1 elements are already in the correct locations // because otherwise we can just always use two SHUFPS instructions which // are much smaller to encode than a SHUFPS and an INSERTPS. We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); unsigned ZMask = 0; int V1DstIndex = -1; int V2DstIndex = -1; bool V1UsedInPlace = false; for (int i = 0; i < 4; ++i) { // Synthesize a zero mask from the zeroable elements (includes undefs). if (Zeroable[i]) { ZMask |= 1 << i; continue; } // Flag if we use any V1 inputs in place. if (i == Mask[i]) { V1UsedInPlace = true; continue; } // We can only insert a single non-zeroable element. if (V1DstIndex != -1 || V2DstIndex != -1) return SDValue(); if (Mask[i] < 4) { // V1 input out of place for insertion. V1DstIndex = i; } else { // V2 input for insertion. V2DstIndex = i; } } // Don't bother if we have no (non-zeroable) element for insertion. if (V1DstIndex == -1 && V2DstIndex == -1) return SDValue(); // Determine element insertion src/dst indices. The src index is from the // start of the inserted vector, not the start of the concatenated vector. unsigned V2SrcIndex = 0; if (V1DstIndex != -1) { // If we have a V1 input out of place, we use V1 as the V2 element insertion // and don't use the original V2 at all. V2SrcIndex = Mask[V1DstIndex]; V2DstIndex = V1DstIndex; V2 = V1; } else { V2SrcIndex = Mask[V2DstIndex] - 4; } // If no V1 inputs are used in place, then the result is created only from // the zero mask and the V2 insertion - so remove V1 dependency. if (!V1UsedInPlace) V1 = DAG.getUNDEF(MVT::v4f32); unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); // Insert the V2 element into the desired position. SDLoc DL(Op); return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getConstant(InsertPSMask, DL, MVT::i8)); } /// \brief Try to lower a shuffle as a permute of the inputs followed by an /// UNPCK instruction. /// /// This specifically targets cases where we end up with alternating between /// the two inputs, and so can permute them into something that feeds a single /// UNPCK instruction. Note that this routine only targets integer vectors /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "This routine only supports integer vectors."); assert(!isSingleInputShuffleMask(Mask) && "This routine should only be used when blending two inputs."); assert(Mask.size() >= 2 && "Single element masks are invalid."); int Size = Mask.size(); int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) { return M >= 0 && M % Size < Size / 2; }); int NumHiInputs = std::count_if( Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; }); bool UnpackLo = NumLoInputs >= NumHiInputs; auto TryUnpack = [&](MVT UnpackVT, int Scale) { SmallVector V1Mask(Mask.size(), -1); SmallVector V2Mask(Mask.size(), -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; // Each element of the unpack contains Scale elements from this mask. int UnpackIdx = i / Scale; // We only handle the case where V1 feeds the first slots of the unpack. // We rely on canonicalization to ensure this is the case. if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) return SDValue(); // Setup the mask for this input. The indexing is tricky as we have to // handle the unpack stride. SmallVectorImpl &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size; } // If we will have to shuffle both inputs to use the unpack, check whether // we can just unpack first and shuffle the result. If so, skip this unpack. if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) return SDValue(); // Shuffle the inputs into place. V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); // Cast the inputs to the type we will use to unpack them. V1 = DAG.getBitcast(UnpackVT, V1); V2 = DAG.getBitcast(UnpackVT, V2); // Unpack the inputs and cast the result back to the desired type. return DAG.getBitcast( VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, UnpackVT, V1, V2)); }; // We try each unpack from the largest to the smallest to try and find one // that fits this mask. int OrigNumElements = VT.getVectorNumElements(); int OrigScalarSize = VT.getScalarSizeInBits(); for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) { int Scale = ScalarSize / OrigScalarSize; int NumElements = OrigNumElements / Scale; MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements); if (SDValue Unpack = TryUnpack(UnpackVT, Scale)) return Unpack; } // If none of the unpack-rooted lowerings worked (or were profitable) try an // initial unpack. if (NumLoInputs == 0 || NumHiInputs == 0) { assert((NumLoInputs > 0 || NumHiInputs > 0) && "We have to have *some* inputs!"); int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; // FIXME: We could consider the total complexity of the permute of each // possible unpacking. Or at the least we should consider how many // half-crossings are created. // FIXME: We could consider commuting the unpacks. SmallVector PermMask; PermMask.assign(Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); PermMask[i] = 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1); } return DAG.getVectorShuffle( VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT, V1, V2), DAG.getUNDEF(VT), PermMask); } return SDValue(); } /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full /// support for floating point shuffles but not integer shuffles. These /// instructions will incur a domain crossing penalty on some chips though so /// it is better to avoid lowering through this for integer vectors where /// possible. static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (isSingleInputShuffleMask(Mask)) { // Use low duplicate instructions for masks that match their pattern. if (Subtarget->hasSSE3()) if (isShuffleEquivalent(V1, V2, Mask, {0, 0})) return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1); // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); if (Subtarget->hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); assert(Mask[1] >= 2 && "Non-canonicalized blend!"); // If we have a single input, insert that into V1 if we can do so cheaply. if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG)) return Insertion; } // Try to use one of the special instruction patterns to handle two common // blend patterns if a zero-blend above didn't work. if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || isShuffleEquivalent(V1, V2, Mask, {1, 3})) if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) // We can either use a special instruction to load over the low double or // to move just the low double. return DAG.getNode( isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD, DL, MVT::v2f64, V2, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); if (Subtarget->hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) return V; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } /// \brief Handle lowering of 2-lane 64-bit integer shuffles. /// /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by /// the integer unit to minimize domain crossing penalties. However, for blends /// it falls back to the floating point shuffle operation with appropriate bit /// casting. static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (isSingleInputShuffleMask(Mask)) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. V1 = DAG.getBitcast(MVT::v4i32, V1); int WidenedMask[4] = { std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; return DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG))); } assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); // If we have a blend of two PACKUS operations an the blend aligns with the // low and half halves, we can just merge the PACKUS operations. This is // particularly important as it lets us merge shuffles that this routine itself // creates. auto GetPackNode = [](SDValue V) { while (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); return V.getOpcode() == X86ISD::PACKUS ? V : SDValue(); }; if (SDValue V1Pack = GetPackNode(V1)) if (SDValue V2Pack = GetPackNode(V2)) return DAG.getBitcast(MVT::v2i64, DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Mask[0] == 0 ? V1Pack.getOperand(0) : V1Pack.getOperand(1), Mask[1] == 2 ? V2Pack.getOperand(0) : V2Pack.getOperand(1))); // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG)) return Shift; // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG)) return Insertion; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget->hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget->hasSSSE3()) if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask, DAG); // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't // have this problem. It would be really nice if x86 had better shuffles here. V1 = DAG.getBitcast(MVT::v2f64, V1); V2 = DAG.getBitcast(MVT::v2f64, V2); return DAG.getBitcast(MVT::v2i64, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } /// \brief Test whether this can be lowered with a single SHUFPS instruction. /// /// This is used to disable more specialized lowerings when the shufps lowering /// will happen to be efficient. static bool isSingleSHUFPSMask(ArrayRef Mask) { // This routine only handles 128-bit shufps. assert(Mask.size() == 4 && "Unsupported mask size!"); // To lower with a single SHUFPS we need to have the low half and high half // each requiring a single input. if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4)) return false; if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4)) return false; return true; } /// \brief Lower a vector shuffle using the SHUFPS instruction. /// /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. /// It makes no assumptions about whether this is the *best* lowering, it simply /// uses it. static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); if (NumV2Elements == 1) { int V2Index = std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - Mask.begin(); // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; if (Mask[V2AdjIndex] == -1) { // Handles all the cases where we have a single V2 element and an undef. // This will only ever happen in the high lanes because we commute the // vector otherwise. if (V2Index < 2) std::swap(LowV, HighV); NewMask[V2Index] -= 4; } else { // Handle the case where the V2 element ends up adjacent to a V1 element. // To make this work, blend them together as the first step. int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now proceed to reconstruct the final blend as we have the necessary // high or low half formed. if (V2Index < 2) { LowV = V2; HighV = V1; } else { HighV = V2; } NewMask[V1Index] = 2; // We put the V1 element in V2[2]. NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. } } else if (NumV2Elements == 2) { if (Mask[0] < 4 && Mask[1] < 4) { // Handle the easy case where we have V1 in the low lanes and V2 in the // high lanes. NewMask[2] -= 4; NewMask[3] -= 4; } else if (Mask[2] < 4 && Mask[3] < 4) { // We also handle the reversed case because this utility may get called // when we detect a SHUFPS pattern but can't easily commute the shuffle to // arrange things in the right direction. NewMask[0] -= 4; NewMask[1] -= 4; HighV = V1; LowV = V2; } else { // We have a mixture of V1 and V2 in both low and high lanes. Rather than // trying to place elements directly, just blend them and set up the final // shuffle to place them. // The first two blend mask elements are for V1, the second two are for // V2. int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], Mask[2] < 4 ? Mask[2] : Mask[3], (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to // a blend. LowV = HighV = V1; NewMask[0] = Mask[0] < 4 ? 0 : 2; NewMask[1] = Mask[0] < 4 ? 2 : 0; NewMask[2] = Mask[2] < 4 ? 1 : 3; NewMask[3] = Mask[2] < 4 ? 3 : 1; } } return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); } /// \brief Lower 4-lane 32-bit floating point shuffles. /// /// Uses instructions exclusively from the floating point unit to minimize /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1, Mask, Subtarget, DAG)) return Broadcast; // Use even/odd duplicate instructions for masks that match their pattern. if (Subtarget->hasSSE3()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); } if (Subtarget->hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Otherwise, use a straight shuffle of a single input vector. We pass the // input vector to both operands to simulate this with a SHUFPS. return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that // we defer to if both this and BLENDPS fail to match, so restrict this to // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return V; if (Subtarget->hasSSE41()) { if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return Blend; // Use INSERTPS if we can complete the shuffle efficiently. if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG)) return V; if (!isSingleSHUFPSMask(Mask)) if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( DL, MVT::v4f32, V1, V2, Mask, DAG)) return BlendPerm; } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) return V; // Otherwise fall back to a SHUFPS lowering strategy. return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); } /// \brief Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return ZExt; int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We coerce the shuffle pattern to be compatible with UNPCK instructions // but we aren't actually going to use the UNPCK instruction because doing // so prevents folding a load into this instruction or making a copy. const int UnpackLoMask[] = {0, 0, 1, 1}; const int UnpackHiMask[] = {2, 2, 3, 3}; if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1})) Mask = UnpackLoMask; else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3})) Mask = UnpackHiMask; return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget->hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Blend; if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget->hasSSSE3()) if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask, DAG); // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG)) return Unpack; // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would encur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. return DAG.getBitcast( MVT::v4i32, DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1), DAG.getBitcast(MVT::v4f32, V2), Mask)); } /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 /// shuffle lowering, and the most complex part. /// /// The lowering strategy is to try to form pairs of input lanes which are /// targeted at the same half of the final vector, and then use a dword shuffle /// to place them onto the right half, and finally unpack the paired lanes into /// their final position. /// /// The exact breakdown of how to form these dword pairs and align them on the /// correct sides is really tricky. See the comments within the function for /// more of the details. /// /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 /// vector, form the analogous 128-bit 8-element Mask. static SDValue lowerV8I16GeneralSingleInputVectorShuffle( SDLoc DL, MVT VT, SDValue V, MutableArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); assert(Mask.size() == 8 && "Shuffle mask length doen't match!"); MutableArrayRef LoMask = Mask.slice(0, 4); MutableArrayRef HiMask = Mask.slice(4, 4); SmallVector LoInputs; std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs), [](int M) { return M >= 0; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector HiInputs; std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs), [](int M) { return M >= 0; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); int NumLToL = std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); int NumHToL = LoInputs.size() - NumLToL; int NumLToH = std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); int NumHToH = HiInputs.size() - NumLToH; MutableArrayRef LToLInputs(LoInputs.data(), NumLToL); MutableArrayRef LToHInputs(HiInputs.data(), NumLToH); MutableArrayRef HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef HToHInputs(HiInputs.data() + NumLToH, NumHToH); // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through // to the generic code below. For example: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] // // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half // and an existing 2-into-2 on the other half. In this case we may have to // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. // Fortunately, we don't have to handle anything but a 2-into-2 pattern // because any other situation (including a 3-into-1 or 1-into-3 in the other // half than the one we target for fixing) will be fixed when we re-enter this // path. We will also combine away any sequence of PSHUFD instructions that // result into a single instruction. Here is an example of the tricky case: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] // // This now has a 1-into-3 in the high half! Instead, we do two shuffles: // // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] // // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] // // The result is fine to be handled by the generic logic. auto balanceSides = [&](ArrayRef AToAInputs, ArrayRef BToAInputs, ArrayRef BToBInputs, ArrayRef AToBInputs, int AOffset, int BOffset) { assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && "Must call this with A having 3 or 1 inputs from the A half."); assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && "Must call this with B having 1 or 3 inputs from the B half."); assert(AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); bool ThreeAInputs = AToAInputs.size() == 3; // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. int ADWord, BDWord; int &TripleDWord = ThreeAInputs ? ADWord : BDWord; int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; ArrayRef TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); int TripleNonInputIdx = TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); TripleDWord = TripleNonInputIdx / 2; // We use xor with one to compute the adjacent DWord to whichever one the // OneInput is in. OneInputDWord = (OneInput / 2) ^ 1; // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA // and BToA inputs. If there is also such a problem with the BToB and AToB // inputs, we don't try to fix it necessarily -- we'll recurse and see it in // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it // is essential that we don't *create* a 3<-1 as then we might oscillate. if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { // Compute how many inputs will be flipped by swapping these DWords. We // need // to balance this to ensure we don't form a 3-1 shuffle in the other // half. int NumFlippedAToBInputs = std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); int NumFlippedBToBInputs = std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); if ((NumFlippedAToBInputs == 1 && (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || (NumFlippedBToBInputs == 1 && (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { // We choose whether to fix the A half or B half based on whether that // half has zero flipped inputs. At zero, we may not be able to fix it // with that half. We also bias towards fixing the B half because that // will more commonly be the high half, and we have to bias one way. auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, ArrayRef Inputs) { int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(), PinnedIdx ^ 1) != Inputs.end(); // Determine whether the free index is in the flipped dword or the // unflipped dword based on where the pinned index is. We use this bit // in an xor to conditionally select the adjacent dword. int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), FixFreeIdx) != Inputs.end(); if (IsFixIdxInput == IsFixFreeIdxInput) FixFreeIdx += 1; IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), FixFreeIdx) != Inputs.end(); assert(IsFixIdxInput != IsFixFreeIdxInput && "We need to be changing the number of flipped inputs!"); int PSHUFHalfMask[] = {0, 1, 2, 3}; std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, MVT::v8i16, V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); for (int &M : Mask) if (M != -1 && M == FixIdx) M = FixFreeIdx; else if (M != -1 && M == FixFreeIdx) M = FixIdx; }; if (NumFlippedBToBInputs != 0) { int BPinnedIdx = BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); } else { assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); } } } int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[ADWord] = BDWord; PSHUFDMask[BDWord] = ADWord; V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // Adjust the mask to match the new locations of A and B. for (int &M : Mask) if (M != -1 && M/2 == ADWord) M = 2 * BDWord + M % 2; else if (M != -1 && M/2 == BDWord) M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from // each half. That means the inputs can always be grouped into dwords and // those dwords can then be moved to the correct half with a dword shuffle. // We use at most one low and one high word shuffle to collect these paired // inputs into dwords, and finally a dword shuffle to place them. int PSHUFLMask[4] = {-1, -1, -1, -1}; int PSHUFHMask[4] = {-1, -1, -1, -1}; int PSHUFDMask[4] = {-1, -1, -1, -1}; // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. auto fixInPlaceInputs = [&PSHUFDMask](ArrayRef InPlaceInputs, ArrayRef IncomingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } if (IncomingInputs.empty()) { // Just fix all of the in place inputs. for (int Input : InPlaceInputs) { SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; PSHUFDMask[Input / 2] = Input / 2; } return; } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; // Put the second input next to the first so that they are packed into // a dword. We find the adjacent index by toggling the low bit. int AdjIndex = InPlaceInputs[0] ^ 1; SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. // FIXME: This operation could almost certainly be simplified dramatically to // look more like the 3-1 fixing operation. auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef IncomingInputs, ArrayRef ExistingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, MutableArrayRef FinalSourceHalfMask, int SourceOffset, int DestOffset) { auto isWordClobbered = [](ArrayRef SourceHalfMask, int Word) { return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; }; auto isDWordClobbered = [&isWordClobbered](ArrayRef SourceHalfMask, int Word) { int LowWord = Word & ~1; int HighWord = Word | 1; return isWordClobbered(SourceHalfMask, LowWord) || isWordClobbered(SourceHalfMask, HighWord); }; if (IncomingInputs.empty()) return; if (ExistingInputs.empty()) { // Map any dwords with inputs from them into the right half. for (int Input : IncomingInputs) { // If the source half mask maps over the inputs, turn those into // swaps and use the swapped lane. if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) { SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = Input - SourceOffset; // We have to swap the uses in our half mask in one sweep. for (int &M : HalfMask) if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) M = Input; else if (M == Input) M = SourceHalfMask[Input - SourceOffset] + SourceOffset; } else { assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && "Previous placement doesn't match!"); } // Note that this correctly re-maps both when we do a swap and when // we observe the other side of the swap above. We rely on that to // avoid swapping the members of the input list directly. Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; } // Map the input's dword into the correct half. if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1) PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; else assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!"); } // And just directly shift any other-half mask elements to be same-half // as we will have mirrored the dword containing the element into the // same position within that half. for (int &M : HalfMask) if (M >= SourceOffset && M < SourceOffset + 4) { M = M - SourceOffset + DestOffset; assert(M >= 0 && "This should never wrap below zero!"); } return; } // Ensure we have the input in a viable dword of its current half. This // is particularly tricky because the original position may be clobbered // by inputs being moved and *staying* in that half. if (IncomingInputs.size() == 1) { if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { int InputFixed = std::find(std::begin(SourceHalfMask), std::end(SourceHalfMask), -1) - std::begin(SourceHalfMask) + SourceOffset; SourceHalfMask[InputFixed - SourceOffset] = IncomingInputs[0] - SourceOffset; std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], InputFixed); IncomingInputs[0] = InputFixed; } } else if (IncomingInputs.size() == 2) { if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { // We have two non-adjacent or clobbered inputs we need to extract from // the source half. To do this, we need to map them into some adjacent // dword slot in the source mask. int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, IncomingInputs[1] - SourceOffset}; // If there is a free slot in the source half mask adjacent to one of // the inputs, place the other input in it. We use (Index XOR 1) to // compute an adjacent index. if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && SourceHalfMask[InputsFixed[0] ^ 1] == -1) { SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; InputsFixed[1] = InputsFixed[0] ^ 1; } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && SourceHalfMask[InputsFixed[1] ^ 1] == -1) { SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; InputsFixed[0] = InputsFixed[1] ^ 1; } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 && SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) { // The two inputs are in the same DWord but it is clobbered and the // adjacent DWord isn't used at all. Move both inputs to the free // slot. SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; } else { // The only way we hit this point is if there is no clobbering // (because there are no off-half inputs to this half) and there is no // free slot adjacent to one of the inputs. In this case, we have to // swap an input with a non-input. for (int i = 0; i < 4; ++i) assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) && "We can't handle any clobbers here!"); assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && "Cannot have adjacent inputs here!"); SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; // We also have to update the final source mask in this case because // it may need to undo the above swap. for (int &M : FinalSourceHalfMask) if (M == (InputsFixed[0] ^ 1) + SourceOffset) M = InputsFixed[1] + SourceOffset; else if (M == InputsFixed[1] + SourceOffset) M = (InputsFixed[0] ^ 1) + SourceOffset; InputsFixed[1] = InputsFixed[0] ^ 1; } // Point everything at the fixed inputs. for (int &M : HalfMask) if (M == IncomingInputs[0]) M = InputsFixed[0] + SourceOffset; else if (M == IncomingInputs[1]) M = InputsFixed[1] + SourceOffset; IncomingInputs[0] = InputsFixed[0] + SourceOffset; IncomingInputs[1] = InputsFixed[1] + SourceOffset; } } else { llvm_unreachable("Unhandled input size!"); } // Now hoist the DWord down to the right half. int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2; assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; for (int &M : HalfMask) for (int Input : IncomingInputs) if (M == Input) M = FreeDWord * 2 + Input % 2; }; moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, /*SourceOffset*/ 4, /*DestOffset*/ 0); moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, /*SourceOffset*/ 0, /*DestOffset*/ 4); // Now enact all the shuffles we've computed to move the inputs into their // target half. if (!isNoopShuffleMask(PSHUFLMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFHMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFDMask)) V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // At this point, each half should contain all its inputs, and we can then // just shuffle them into their final position. assert(std::count_if(LoMask.begin(), LoMask.end(), [](int M) { return M >= 4; }) == 0 && "Failed to lift all the high half inputs to the low mask!"); assert(std::count_if(HiMask.begin(), HiMask.end(), [](int M) { return M >= 0 && M < 4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"); // Do a half shuffle for the low mask. if (!isNoopShuffleMask(LoMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); // Do a half shuffle with the high mask after shifting its values down. for (int &M : HiMask) if (M >= 0) M -= 4; if (!isNoopShuffleMask(HiMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); return V; } /// \brief Helper to form a PSHUFB-based shuffle+blend. static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); SDValue V1Mask[16]; SDValue V2Mask[16]; V1InUse = false; V2InUse = false; int Size = Mask.size(); int Scale = 16 / Size; for (int i = 0; i < 16; ++i) { if (Mask[i / Scale] == -1) { V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); } else { const int ZeroMask = 0x80; int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale : ZeroMask; int V2Idx = Mask[i / Scale] < Size ? ZeroMask : (Mask[i / Scale] - Size) * Scale + i % Scale; if (Zeroable[i / Scale]) V1Idx = V2Idx = ZeroMask; V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8); V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8); V1InUse |= (ZeroMask != V1Idx); V2InUse |= (ZeroMask != V2Idx); } } if (V1InUse) V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, DAG.getBitcast(MVT::v16i8, V1), DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); if (V2InUse) V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, DAG.getBitcast(MVT::v16i8, V2), DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); // If we need shuffled inputs from both, blend the two. SDValue V; if (V1InUse && V2InUse) V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); else V = V1InUse ? V1 : V2; // Cast the result back to the correct type. return DAG.getBitcast(VT, V); } /// \brief Generic lowering of 8-lane i16 shuffles. /// /// This handles both single-input shuffles and combined shuffle/blends with /// two inputs. The single input shuffles are immediately delegated to /// a dedicated lowering routine. /// /// The blends are lowered in one of three fundamental ways. If there are few /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle /// of the input is significantly cheaper when lowered as an interleaving of /// the two inputs, try to interleave them. Otherwise, blend the low and high /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef OrigMask = SVOp->getMask(); int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]}; MutableArrayRef Mask(MaskStorage); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG)) return ZExt; auto isV1 = [](int M) { return M >= 0 && M < 8; }; (void)isV1; auto isV2 = [](int M) { return M >= 8; }; int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2); if (NumV2Inputs == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1, Mask, Subtarget, DAG)) return Broadcast; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG)) return Shift; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask, Subtarget, DAG)) return Rotate; return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask, Subtarget, DAG); } assert(std::any_of(Mask.begin(), Mask.end(), isV1) && "All single-input shuffles should be canonicalized to be V1-input " "shuffles."); // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG)) return Shift; // See if we can use SSE4A Extraction / Insertion. if (Subtarget->hasSSE4A()) if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG)) return V; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget->hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Blend; if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue BitBlend = lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it // can both shuffle and set up the inefficient blend. if (!IsBlendSupported && Subtarget->hasSSSE3()) { bool V1InUse, V2InUse; return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG, V1InUse, V2InUse); } // We can always bit-blend if we have to so the fallback strategy is to // decompose into single-input permutes and blends. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, Mask, DAG); } /// \brief Check whether a compaction lowering can be done by dropping even /// elements and compute how many times even elements must be dropped. /// /// This handles shuffles which take every Nth element where N is a power of /// two. Example shuffle masks: /// /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 /// /// Any of these lanes can of course be undef. /// /// This routine only supports N <= 3. /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here /// for larger N. /// /// \returns N above, or the number of times even elements must be dropped if /// there is such a number. Otherwise returns zero. static int canLowerByDroppingEvenElements(ArrayRef Mask) { // Figure out whether we're looping over two inputs or just one. bool IsSingleInput = isSingleInputShuffleMask(Mask); // The modulus for the shuffle vector entries is based on whether this is // a single input or not. int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); assert(isPowerOf2_32((uint32_t)ShuffleModulus) && "We should only be called with masks with a power-of-2 size!"); uint64_t ModMask = (uint64_t)ShuffleModulus - 1; // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, // and 2^3 simultaneously. This is because we may have ambiguity with // partially undef inputs. bool ViableForN[3] = {true, true, true}; for (int i = 0, e = Mask.size(); i < e; ++i) { // Ignore undef lanes, we'll optimistically collapse them to the pattern we // want. if (Mask[i] == -1) continue; bool IsAnyViable = false; for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) if (ViableForN[j]) { uint64_t N = j + 1; // The shuffle mask must be equal to (i * 2^N) % M. if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) IsAnyViable = true; else ViableForN[j] = false; } // Early exit if we exhaust the possible powers of two. if (!IsAnyViable) break; } for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) if (ViableForN[j]) return j + 1; // Return 0 as there is no viable power of two. return 0; } /// \brief Generic lowering of v16i8 shuffles. /// /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to /// detect any complexity reducing interleaving. If that doesn't help, it uses /// UNPCK to spread the i8 elements across two i16-element vectors, and uses /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to use a zext lowering. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return ZExt; // See if we can use SSE4A Extraction / Insertion. if (Subtarget->hasSSE4A()) if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG)) return V; int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); // For single-input shuffles, there are some nicer lowering tricks we can use. if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1, Mask, Subtarget, DAG)) return Broadcast; // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. // However, it only makes sense if the pre-duplication shuffle simplifies // things significantly. Currently, this means we need to be able to // express the pre-duplication shuffle as an i16 shuffle. // // FIXME: We should check for other patterns which can be widened into an // i16 shuffle as well. auto canWidenViaDuplication = [](ArrayRef Mask) { for (int i = 0; i < 16; i += 2) if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1]) return false; return true; }; auto tryToWidenViaDuplication = [&]() -> SDValue { if (!canWidenViaDuplication(Mask)) return SDValue(); SmallVector LoInputs; std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), [](int M) { return M >= 0 && M < 8; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector HiInputs; std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), [](int M) { return M >= 8; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); bool TargetLo = LoInputs.size() >= HiInputs.size(); ArrayRef InPlaceInputs = TargetLo ? LoInputs : HiInputs; ArrayRef MovingInputs = TargetLo ? HiInputs : LoInputs; int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; SmallDenseMap LaneMap; for (int I : InPlaceInputs) { PreDupI16Shuffle[I/2] = I/2; LaneMap[I] = I; } int j = TargetLo ? 0 : 4, je = j + 4; for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { // Check if j is already a shuffle of this input. This happens when // there are two adjacent bytes after we move the low one. if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { // If we haven't yet mapped the input, search for a slot into which // we can map it. while (j < je && PreDupI16Shuffle[j] != -1) ++j; if (j == je) // We can't place the inputs into a single half with a simple i16 shuffle, so bail. return SDValue(); // Map this input with the i16 shuffle. PreDupI16Shuffle[j] = MovingInputs[i] / 2; } // Update the lane map based on the mapping we ended up with. LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; } V1 = DAG.getBitcast( MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); // Unpack the bytes to form the i16s that will be shuffled into place. V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, MVT::v16i8, V1, V1); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 16; ++i) if (Mask[i] != -1) { int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); if (PostDupI16Shuffle[i / 2] == -1) PostDupI16Shuffle[i / 2] = MappedMask; else assert(PostDupI16Shuffle[i / 2] == MappedMask && "Conflicting entrties in the original shuffle!"); } return DAG.getBitcast( MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); }; if (SDValue V = tryToWidenViaDuplication()) return V; } if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any // blends but after all of the single-input lowerings. If the single input // lowerings can find an instruction sequence that is faster than a PSHUFB, we // want to preserve that and we can DAG combine any longer sequences into // a PSHUFB in the end. But once we start blending from multiple inputs, // the complexity of DAG combining bad patterns back into PSHUFB is too high, // and there are *very* few patterns that would actually be faster than the // PSHUFB approach because of its ability to zero lanes. // // FIXME: The only exceptions to the above are blends which are exact // interleavings with direct instructions supporting them. We currently don't // handle those well here. if (Subtarget->hasSSSE3()) { bool V1InUse = false; bool V2InUse = false; SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask, DAG, V1InUse, V2InUse); // If both V1 and V2 are in use and we can use a direct blend or an unpack, // do so. This avoids using them to handle blends-with-zero which is // important as a single pshufb is significantly faster for that. if (V1InUse && V2InUse) { if (Subtarget->hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Blend; // We can use an unpack to do the blending rather than an or in some // cases. Even though the or may be (very minorly) more efficient, we // preference this lowering because there are common cases where part of // the complexity of the shuffles goes away when we do the final blend as // an unpack. // FIXME: It might be worth trying to detect if the unpack-feeding // shuffles will both be pshufb, in which case we shouldn't bother with // this. if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( DL, MVT::v16i8, V1, V2, Mask, DAG)) return Unpack; } return PSHUFB; } // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return V; if (SDValue BitBlend = lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) return BitBlend; // Check whether a compaction lowering can be done. This handles shuffles // which take every Nth element for some even N. See the helper function for // details. // // We special case these as they can be particularly efficiently handled with // the PACKUSB instruction on x86 and they show up in common patterns of // rearranging bytes to truncate wide elements. if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) { // NumEvenDrops is the power of two stride of the elements. Another way of // thinking about it is that we need to drop the even elements this many // times to get the original input. bool IsSingleInput = isSingleInputShuffleMask(Mask); // First we need to zero all the dropped bytes. assert(NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."); // We use the mask type to pick which bytes are preserved based on how many // elements are dropped. MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; SDValue ByteClearMask = DAG.getBitcast( MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); if (!IsSingleInput) V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); // Now pack things back together. V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2); SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); for (int i = 1; i < NumEvenDrops; ++i) { Result = DAG.getBitcast(MVT::v8i16, Result); Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); } return Result; } // Handle multi-input cases by blending single-input shuffles. if (NumV2Elements > 0) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask, DAG); // The fallback path for single-input shuffles widens this into two v8i16 // vectors with unpacks, shuffles those, and then pulls them back together // with a pack. SDValue V = V1; int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 16; ++i) if (Mask[i] >= 0) (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i]; SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); SDValue VLoHalf, VHiHalf; // Check if any of the odd lanes in the v16i8 are used. If not, we can mask // them out and avoid using UNPCK{L,H} to extract the elements of V as // i16s. if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask), [](int M) { return M >= 0 && M % 2 == 1; }) && std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask), [](int M) { return M >= 0 && M % 2 == 1; })) { // Use a mask to drop the high bytes. VLoHalf = DAG.getBitcast(MVT::v8i16, V); VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, DAG.getConstant(0x00FF, DL, MVT::v8i16)); // This will be a single vector shuffle instead of a blend so nuke VHiHalf. VHiHalf = DAG.getUNDEF(MVT::v8i16); // Squash the masks to point directly into VLoHalf. for (int &M : LoBlendMask) if (M >= 0) M /= 2; for (int &M : HiBlendMask) if (M >= 0) M /= 2; } else { // Otherwise just unpack the low half of V into VLoHalf and the high half into // VHiHalf so that we can blend them as i16s. VLoHalf = DAG.getBitcast( MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); VHiHalf = DAG.getBitcast( MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); } SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); } /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles. /// /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG) { switch (VT.SimpleTy) { case MVT::v2i64: return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v2f64: return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v4i32: return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v4f32: return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v8i16: return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v16i8: return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG); default: llvm_unreachable("Unimplemented!"); } } /// \brief Helper function to test whether a shuffle mask could be /// simplified by widening the elements being shuffled. /// /// Appends the mask for wider elements in WidenedMask if valid. Otherwise /// leaves it in an unspecified state. /// /// NOTE: This must handle normal vector shuffle masks and *target* vector /// shuffle masks. The latter have the special property of a '-2' representing /// a zero-ed lane of a vector. static bool canWidenShuffleElements(ArrayRef Mask, SmallVectorImpl &WidenedMask) { for (int i = 0, Size = Mask.size(); i < Size; i += 2) { // If both elements are undef, its trivial. if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { WidenedMask.push_back(SM_SentinelUndef); continue; } // Check for an undef mask and a mask value properly aligned to fit with // a pair of values. If we find such a case, use the non-undef mask's value. if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) { WidenedMask.push_back(Mask[i + 1] / 2); continue; } if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { WidenedMask.push_back(Mask[i] / 2); continue; } // When zeroing, we need to spread the zeroing across both lanes to widen. if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { WidenedMask.push_back(SM_SentinelZero); continue; } return false; } // Finally check if the two mask values are adjacent and aligned with // a pair. if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) { WidenedMask.push_back(Mask[i] / 2); continue; } // Otherwise we can't safely widen the elements used in this shuffle. return false; } assert(WidenedMask.size() == Mask.size() / 2 && "Incorrect size of mask after widening the elements!"); return true; } /// \brief Generic routine to split vector shuffle into half-sized shuffles. /// /// This routine just extracts two subvectors, shuffles them independently, and /// then concatenates them back together. This should work effectively with all /// AVX vector shuffle types. static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.getSizeInBits() >= 256 && "Only for 256-bit or wider vector shuffles!"); assert(V1.getSimpleValueType() == VT && "Bad operand type!"); assert(V2.getSimpleValueType() == VT && "Bad operand type!"); ArrayRef LoMask = Mask.slice(0, Mask.size() / 2); ArrayRef HiMask = Mask.slice(Mask.size() / 2); int NumElements = VT.getVectorNumElements(); int SplitNumElements = NumElements / 2; MVT ScalarVT = VT.getVectorElementType(); MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); // Rather than splitting build-vectors, just build two narrower build // vectors. This helps shuffling with splats and zeros. auto SplitVector = [&](SDValue V) { while (V.getOpcode() == ISD::BITCAST) V = V->getOperand(0); MVT OrigVT = V.getSimpleValueType(); int OrigNumElements = OrigVT.getVectorNumElements(); int OrigSplitNumElements = OrigNumElements / 2; MVT OrigScalarVT = OrigVT.getVectorElementType(); MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); SDValue LoV, HiV; auto *BV = dyn_cast(V); if (!BV) { LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, DAG.getIntPtrConstant(0, DL)); HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, DAG.getIntPtrConstant(OrigSplitNumElements, DL)); } else { SmallVector LoOps, HiOps; for (int i = 0; i < OrigSplitNumElements; ++i) { LoOps.push_back(BV->getOperand(i)); HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); } LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps); HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps); } return std::make_pair(DAG.getBitcast(SplitVT, LoV), DAG.getBitcast(SplitVT, HiV)); }; SDValue LoV1, HiV1, LoV2, HiV2; std::tie(LoV1, HiV1) = SplitVector(V1); std::tie(LoV2, HiV2) = SplitVector(V2); // Now create two 4-way blends of these half-width vectors. auto HalfBlend = [&](ArrayRef HalfMask) { bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; SmallVector V1BlendMask, V2BlendMask, BlendMask; for (int i = 0; i < SplitNumElements; ++i) { int M = HalfMask[i]; if (M >= NumElements) { if (M >= NumElements + SplitNumElements) UseHiV2 = true; else UseLoV2 = true; V2BlendMask.push_back(M - NumElements); V1BlendMask.push_back(-1); BlendMask.push_back(SplitNumElements + i); } else if (M >= 0) { if (M >= SplitNumElements) UseHiV1 = true; else UseLoV1 = true; V2BlendMask.push_back(-1); V1BlendMask.push_back(M); BlendMask.push_back(i); } else { V2BlendMask.push_back(-1); V1BlendMask.push_back(-1); BlendMask.push_back(-1); } } // Because the lowering happens after all combining takes place, we need to // manually combine these blend masks as much as possible so that we create // a minimal number of high-level vector shuffle nodes. // First try just blending the halves of V1 or V2. if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) return DAG.getUNDEF(SplitVT); if (!UseLoV2 && !UseHiV2) return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); if (!UseLoV1 && !UseHiV1) return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); SDValue V1Blend, V2Blend; if (UseLoV1 && UseHiV1) { V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); } else { // We only use half of V1 so map the usage down into the final blend mask. V1Blend = UseLoV1 ? LoV1 : HiV1; for (int i = 0; i < SplitNumElements; ++i) if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); } if (UseLoV2 && UseHiV2) { V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); } else { // We only use half of V2 so map the usage down into the final blend mask. V2Blend = UseLoV2 ? LoV2 : HiV2; for (int i = 0; i < SplitNumElements; ++i) if (BlendMask[i] >= SplitNumElements) BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); } return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); }; SDValue Lo = HalfBlend(LoMask); SDValue Hi = HalfBlend(HiMask); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } /// \brief Either split a vector in halves or decompose the shuffles and the /// blend. /// /// This is provided as a good fallback for many lowerings of non-single-input /// shuffles with more than one 128-bit lane. In those cases, we want to select /// between splitting the shuffle into 128-bit components and stitching those /// back together vs. extracting the single-input shuffles and blending those /// results. static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to " "lower single-input shuffles as it " "could then recurse on itself."); int Size = Mask.size(); // If this can be modeled as a broadcast of two elements followed by a blend, // prefer that lowering. This is especially important because broadcasts can // often fold with memory operands. auto DoBothBroadcast = [&] { int V1BroadcastIdx = -1, V2BroadcastIdx = -1; for (int M : Mask) if (M >= Size) { if (V2BroadcastIdx == -1) V2BroadcastIdx = M - Size; else if (M - Size != V2BroadcastIdx) return false; } else if (M >= 0) { if (V1BroadcastIdx == -1) V1BroadcastIdx = M; else if (M != V1BroadcastIdx) return false; } return true; }; if (DoBothBroadcast()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); // If the inputs all stem from a single 128-bit lane of each input, then we // split them rather than blending because the split will decompose to // unusually few instructions. int LaneCount = VT.getSizeInBits() / 128; int LaneSize = Size / LaneCount; SmallBitVector LaneInputs[2]; LaneInputs[0].resize(LaneCount, false); LaneInputs[1].resize(LaneCount, false); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); // Otherwise, just fall back to decomposed shuffles and a blend. This requires // that the decomposed single-input shuffles don't end up here. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); } /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as /// a permutation and blend of those lanes. /// /// This essentially blends the out-of-lane inputs to each lane into the lane /// from a permuted copy of the vector. This lowering strategy results in four /// instructions in the worst case for a single-input cross lane shuffle which /// is lower than any other fully general cross-lane shuffle strategy I'm aware /// of. Special cases for each particular shuffle pattern should be handled /// prior to trying this lowering. static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { // FIXME: This should probably be generalized for 512-bit vectors as well. assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int LaneSize = Mask.size() / 2; // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. bool LaneCrossing[2] = {false, false}; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) LaneCrossing[(Mask[i] % Size) / LaneSize] = true; if (!LaneCrossing[0] || !LaneCrossing[1]) return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); if (isSingleInputShuffleMask(Mask)) { SmallVector FlippedBlendMask; for (int i = 0, Size = Mask.size(); i < Size; ++i) FlippedBlendMask.push_back( Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) ? Mask[i] : Mask[i] % LaneSize + (i / LaneSize) * LaneSize + Size)); // Flip the vector, and blend the results which should now be in-lane. The // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and // 5 for the high source. The value 3 selects the high half of source 2 and // the value 2 selects the low half of source 2. We only use source 2 to // allow folding it into a memory operand. unsigned PERMMask = 3 | 2 << 4; SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), V1, DAG.getConstant(PERMMask, DL, MVT::i8)); return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); } // This now reduces to two single-input shuffles of V1 and V2 which at worst // will be handled by the above logic and a blend of the results, much like // other patterns in AVX. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); } /// \brief Handle lowering 2-lane 128-bit shuffles. static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { // TODO: If minimizing size and one of the inputs is a zero vector and the // the zero vector has only one use, we could use a VPERM2X128 to save the // instruction bytes needed to explicitly generate the zero vector. // Blends are faster and handle all the non-lane-crossing cases. if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Blend; bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode()); bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode()); // If either input operand is a zero vector, use VPERM2X128 because its mask // allows us to replace the zero input with an implicit zero. if (!IsV1Zero && !IsV2Zero) { // Check for patterns which can be matched with a single insert of a 128-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0, DL)); SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); } } // Otherwise form a 128-bit permutation. After accounting for undefs, // convert the 64-bit shuffle mask selection values into 128-bit // selection bits by dividing the indexes by 2 and shifting into positions // defined by a vperm2*128 instruction's immediate control byte. // The immediate permute control byte looks like this: // [1:0] - select 128 bits from sources for low half of destination // [2] - ignore // [3] - zero low half of destination // [5:4] - select 128 bits from sources for high half of destination // [6] - ignore // [7] - zero high half of destination int MaskLO = Mask[0]; if (MaskLO == SM_SentinelUndef) MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1]; int MaskHI = Mask[2]; if (MaskHI == SM_SentinelUndef) MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3]; unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4; // If either input is a zero vector, replace it with an undef input. // Shuffle mask values < 4 are selecting elements of V1. // Shuffle mask values >= 4 are selecting elements of V2. // Adjust each half of the permute mask by clearing the half that was // selecting the zero vector and setting the zero mask bit. if (IsV1Zero) { V1 = DAG.getUNDEF(VT); if (MaskLO < 4) PermMask = (PermMask & 0xf0) | 0x08; if (MaskHI < 4) PermMask = (PermMask & 0x0f) | 0x80; } if (IsV2Zero) { V2 = DAG.getUNDEF(VT); if (MaskLO >= 4) PermMask = (PermMask & 0xf0) | 0x08; if (MaskHI >= 4) PermMask = (PermMask & 0x0f) | 0x80; } return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, DAG.getConstant(PermMask, DL, MVT::i8)); } /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then /// shuffling each lane. /// /// This will only succeed when the result of fixing the 128-bit lanes results /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in /// each 128-bit lanes. This handles many cases where we can quickly blend away /// the lane crosses early and then use simpler shuffles within each lane. /// /// FIXME: It might be worthwhile at some point to support this without /// requiring the 128-bit lane-relative shuffles to be repeating, but currently /// in x86 only floating point has interesting non-repeating shuffles, and even /// those are still *marginally* more expensive. static SDValue lowerVectorShuffleByMerging128BitLanes( SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(!isSingleInputShuffleMask(Mask) && "This is only useful with multiple inputs."); int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); int NumLanes = Size / LaneSize; assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles."); // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also // check whether the in-128-bit lane shuffles share a repeating pattern. SmallVector Lanes; Lanes.resize(NumLanes, -1); SmallVector InLaneMask; InLaneMask.resize(LaneSize, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; int j = i / LaneSize; if (Lanes[j] < 0) { // First entry we've seen for this lane. Lanes[j] = Mask[i] / LaneSize; } else if (Lanes[j] != Mask[i] / LaneSize) { // This doesn't match the lane selected previously! return SDValue(); } // Check that within each lane we have a consistent shuffle mask. int k = i % LaneSize; if (InLaneMask[k] < 0) { InLaneMask[k] = Mask[i] % LaneSize; } else if (InLaneMask[k] != Mask[i] % LaneSize) { // This doesn't fit a repeating in-lane mask. return SDValue(); } } // First shuffle the lanes into place. MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64, VT.getSizeInBits() / 64); SmallVector LaneMask; LaneMask.resize(NumLanes * 2, -1); for (int i = 0; i < NumLanes; ++i) if (Lanes[i] >= 0) { LaneMask[2 * i + 0] = 2*Lanes[i] + 0; LaneMask[2 * i + 1] = 2*Lanes[i] + 1; } V1 = DAG.getBitcast(LaneVT, V1); V2 = DAG.getBitcast(LaneVT, V2); SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask); // Cast it back to the type we actually want. LaneShuffle = DAG.getBitcast(VT, LaneShuffle); // Now do a simple shuffle that isn't lane crossing. SmallVector NewMask; NewMask.resize(Size, -1); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize; assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) && "Must not introduce lane crosses at this point!"); return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); } /// Lower shuffles where an entire half of a 256-bit vector is UNDEF. /// This allows for fast cases such as subvector extraction/insertion /// or shuffling smaller vector types which can lower more efficiently. static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector"); unsigned NumElts = VT.getVectorNumElements(); unsigned HalfNumElts = NumElts / 2; MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); if (!UndefLower && !UndefUpper) return SDValue(); // Upper half is undef and lower half is whole upper subvector. // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> if (UndefUpper && isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, DAG.getIntPtrConstant(HalfNumElts, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, DAG.getIntPtrConstant(0, DL)); } // Lower half is undef and upper half is whole lower subvector. // e.g. vector_shuffle or if (UndefLower && isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, DAG.getIntPtrConstant(HalfNumElts, DL)); } // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. if (UndefLower && Subtarget->hasAVX2() && (VT == MVT::v4f64 || VT == MVT::v4i64)) return SDValue(); // If the shuffle only uses the lower halves of the input operands, // then extract them and perform the 'half' shuffle at half width. // e.g. vector_shuffle or int HalfIdx1 = -1, HalfIdx2 = -1; SmallVector HalfMask; unsigned Offset = UndefLower ? HalfNumElts : 0; for (unsigned i = 0; i != HalfNumElts; ++i) { int M = Mask[i + Offset]; if (M < 0) { HalfMask.push_back(M); continue; } // Determine which of the 4 half vectors this element is from. // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. int HalfIdx = M / HalfNumElts; // Only shuffle using the lower halves of the inputs. // TODO: Investigate usefulness of shuffling with upper halves. if (HalfIdx != 0 && HalfIdx != 2) return SDValue(); // Determine the element index into its half vector source. int HalfElt = M % HalfNumElts; // We can shuffle with up to 2 half vectors, set the new 'half' // shuffle mask accordingly. if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) { HalfMask.push_back(HalfElt); HalfIdx1 = HalfIdx; continue; } if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) { HalfMask.push_back(HalfElt + HalfNumElts); HalfIdx2 = HalfIdx; continue; } // Too many half vectors referenced. return SDValue(); } assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); auto GetHalfVector = [&](int HalfIdx) { if (HalfIdx < 0) return DAG.getUNDEF(HalfVT); SDValue V = (HalfIdx < 2 ? V1 : V2); HalfIdx = (HalfIdx % 2) * HalfNumElts; return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, DAG.getIntPtrConstant(HalfIdx, DL)); }; SDValue Half1 = GetHalfVector(HalfIdx1); SDValue Half2 = GetHalfVector(HalfIdx2); SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, DAG.getIntPtrConstant(Offset, DL)); } /// \brief Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// /// This returns true if the elements from a particular input are already in the /// slot required by the given mask and require no permutation. static bool isShuffleMaskInputInPlace(int Input, ArrayRef Mask) { assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) return false; return true; } static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD"); int NumElts = VT.getVectorNumElements(); bool ShufpdMask = true; bool CommutableMask = true; unsigned Immediate = 0; for (int i = 0; i < NumElts; ++i) { if (Mask[i] < 0) continue; int Val = (i & 6) + NumElts * (i & 1); int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1); if (Mask[i] < Val || Mask[i] > Val + 1) ShufpdMask = false; if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1) CommutableMask = false; Immediate |= (Mask[i] % 2) << i; } if (ShufpdMask) return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, DAG.getConstant(Immediate, DL, MVT::i8)); if (CommutableMask) return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, DAG.getConstant(Immediate, DL, MVT::i8)); return SDValue(); } /// \brief Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); SmallVector WidenedMask; if (canWidenShuffleElements(Mask, WidenedMask)) return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); if (isSingleInputShuffleMask(Mask)) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1, Mask, Subtarget, DAG)) return Broadcast; // Use low duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { // Non-half-crossing single input shuffles can be lowerid with an // interleaved permutation. unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, DAG.getConstant(VPERMILPMask, DL, MVT::i8)); } // With AVX2 we have direct support for this permutation. if (Subtarget->hasAVX2()) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Blend; // Check if the blend happens to exactly fit that of SHUFPD. if (SDValue Op = lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) return Op; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)))) if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Result; // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. if (Subtarget->hasAVX2()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); } /// \brief Handle lowering of 4-lane 64-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!"); SmallVector WidenedMask; if (canWidenShuffleElements(Mask, WidenedMask)) return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, Mask, Subtarget, DAG)) return Broadcast; // When the shuffle is mirrored between the 128-bit lanes of the unit, we can // use lower latency instructions that will operate on both 128-bit lanes. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { if (isSingleInputShuffleMask(Mask)) { int PSHUFDMask[] = {-1, -1, -1, -1}; for (int i = 0; i < 2; ++i) if (RepeatedMask[i] >= 0) { PSHUFDMask[2 * i] = 2 * RepeatedMask[i]; PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1; } return DAG.getBitcast( MVT::v4i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, DAG.getBitcast(MVT::v8i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } } // AVX2 provides a direct instruction for permuting a single input across // lanes. if (isSingleInputShuffleMask(Mask)) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG)) return Shift; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)))) if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, DAG); } /// \brief Handle lowering of 8-lane 32-bit floating point shuffles. /// /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, Mask, Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane, we have many more // options to efficiently lower the shuffle. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Repeated masks must be half the mask width!"); // Use even/odd duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); if (isSingleInputShuffleMask(Mask)) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. We also need to squash the // repeated mask into a simulated v4f32 mask. for (int i = 0; i < 4; ++i) if (RepeatedMask[i] >= 8) RepeatedMask[i] -= 4; return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); } // If we have a single input shuffle with different shuffle patterns in the // two 128-bit lanes use the variable mask to VPERMILPS. if (isSingleInputShuffleMask(Mask)) { SDValue VPermMask[8]; for (int i = 0; i < 8; ++i) VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(Mask[i], DL, MVT::i32); if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) return DAG.getNode( X86ISD::VPERMILPV, DL, MVT::v8f32, V1, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)); if (Subtarget->hasAVX2()) return DAG.getNode( X86ISD::VPERMV, DL, MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Result; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget->hasAVX2()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); } /// \brief Handle lowering of 8-lane 32-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return ZExt; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, Mask, Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the two 128-bit // lanes. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (isSingleInputShuffleMask(Mask)) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG)) return Shift; if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // If the shuffle patterns aren't repeated but it is a single input, directly // generate a cross-lane VPERMD instruction. if (isSingleInputShuffleMask(Mask)) { SDValue VPermMask[8]; for (int i = 0; i < 8; ++i) VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(Mask[i], DL, MVT::i32); return DAG.getNode( X86ISD::VPERMV, DL, MVT::v8i32, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask, DAG); } /// \brief Handle lowering of 16-lane 16-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) return V; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (isSingleInputShuffleMask(Mask)) { // There are no generalized cross-lane shuffle operations available on i16 // element types. if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v16 case. return lowerV8I16GeneralSingleInputVectorShuffle( DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); } SDValue PSHUFBMask[32]; for (int i = 0; i < 16; ++i) { if (Mask[i] == -1) { PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8); continue; } int M = i < 8 ? Mask[i] : Mask[i] - 8; assert(M >= 0 && M < 8 && "Invalid single-input mask!"); PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8); PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8); } return DAG.getBitcast(MVT::v16i16, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, DAG.getBitcast(MVT::v32i8, V1), DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask))); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); } /// \brief Handle lowering of 32-lane 8-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) return V; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (isSingleInputShuffleMask(Mask)) { // There are no generalized cross-lane shuffle operations available on i8 // element types. if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); SDValue PSHUFBMask[32]; for (int i = 0; i < 32; ++i) PSHUFBMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL, MVT::i8); return DAG.getNode( X86ISD::PSHUFB, DL, MVT::v32i8, V1, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); } /// \brief High-level routine to lower various 256-bit x86 vector shuffles. /// /// This routine either breaks down the specific type of a 256-bit x86 vector /// shuffle or splits it into two 128-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = VT.getVectorNumElements(); int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // There is a really nice hard cut-over between AVX1 and AVX2 that means we // can check for those subtargets here and avoid much of the subtarget // querying in the per-vector-type lowering routines. With AVX1 we have // essentially *zero* ability to manipulate a 256-bit vector with integer // types. Since we'll use floating point types there eventually, just // immediately cast everything to a float and operate entirely in that domain. if (VT.isInteger() && !Subtarget->hasAVX2()) { int ElementBits = VT.getScalarSizeInBits(); if (ElementBits < 32) // No floating point type available, decompose into 128-bit vectors. return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), VT.getVectorNumElements()); V1 = DAG.getBitcast(FpVT, V1); V2 = DAG.getBitcast(FpVT, V2); return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); } switch (VT.SimpleTy) { case MVT::v4f64: return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v4i64: return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v8f32: return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v8i32: return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v16i16: return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v32i8: return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 256-bit x86 vector type!"); } } /// \brief Try to lower a vector shuffle as a 128-bit shuffles. static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { assert(VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."); // To handle 256 bit vector requires VLX and most probably // function lowerV2X128VectorShuffle() is better solution. assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle."); SmallVector WidenedMask; if (!canWidenShuffleElements(Mask, WidenedMask)) return SDValue(); // Form a 128-bit permutation. // Convert the 64-bit shuffle mask selection values into 128-bit selection // bits defined by a vshuf64x2 instruction's immediate control byte. unsigned PermMask = 0, Imm = 0; unsigned ControlBitsNum = WidenedMask.size() / 2; for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { if (WidenedMask[i] == SM_SentinelZero) return SDValue(); // Use first element in place of undef mask. Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i]; PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum); } return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, DAG.getConstant(PermMask, DL, MVT::i8)); } static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV"); MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); if (isSingleInputShuffleMask(Mask)) return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); } /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (SDValue Shuf128 = lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Shuf128; if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Unpck; return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) return Unpck; return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 8-lane 64-bit integer shuffles. static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (SDValue Shuf128 = lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Shuf128; if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) return Unpck; return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// \brief Handle lowering of 64-lane 8-bit integer shuffles. static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } /// \brief High-level routine to lower various 512-bit x86 vector shuffles. /// /// This routine either breaks down the specific type of a 512-bit x86 vector /// shuffle or splits it into two 256-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Subtarget->hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"); // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG)) return Broadcast; // Dispatch to each element type for lowering. If we don't have supprot for // specific element type shuffles at 512 bits, immediately split them and // lower them. Each lowering routine of a given type is allowed to assume that // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v8f64: return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v16f32: return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v8i64: return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v16i32: return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v32i16: if (Subtarget->hasBWI()) return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG); break; case MVT::v64i8: if (Subtarget->hasBWI()) return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG); break; default: llvm_unreachable("Not a valid 512-bit x86 vector type!"); } // Otherwise fall back on splitting. return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } // Lower vXi1 vector shuffles. // There is no a dedicated instruction on AVX-512 that shuffles the masks. // The only way to shuffle bits is to sign-extend the mask vector to SIMD // vector, shuffle and then truncate it back. static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Subtarget->hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); MVT ExtVT; switch (VT.SimpleTy) { default: llvm_unreachable("Expected a vector of i1 elements"); case MVT::v2i1: ExtVT = MVT::v2i64; break; case MVT::v4i1: ExtVT = MVT::v4i32; break; case MVT::v8i1: ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL break; case MVT::v16i1: ExtVT = MVT::v16i32; break; case MVT::v32i1: ExtVT = MVT::v32i16; break; case MVT::v64i1: ExtVT = MVT::v64i8; break; } if (ISD::isBuildVectorAllZeros(V1.getNode())) V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); else if (ISD::isBuildVectorAllOnes(V1.getNode())) V1 = getOnesVector(ExtVT, Subtarget, DAG, DL); else V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); if (V2.isUndef()) V2 = DAG.getUNDEF(ExtVT); else if (ISD::isBuildVectorAllZeros(V2.getNode())) V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); else if (ISD::isBuildVectorAllOnes(V2.getNode())) V2 = getOnesVector(ExtVT, Subtarget, DAG, DL); else V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask)); } /// \brief Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 /// vector shuffles. Most of the specific lowering strategies are encapsulated /// above in helper routines. The canonicalization attempts to widen shuffles /// to involve fewer lanes of wider elements, consolidate symmetric patterns /// s.t. only one of the two inputs needs to be tested, etc. static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); int NumElements = VT.getVectorNumElements(); SDLoc dl(Op); bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); assert((VT.getSizeInBits() != 64 || Is1BitVector) && "Can't lower MMX shuffles"); bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; if (V1IsUndef && V2IsUndef) return DAG.getUNDEF(VT); // When we create a shuffle node we put the UNDEF node to second operand, // but in some cases the first operand may be transformed to UNDEF. // In this case we should just commute the node. if (V1IsUndef) return DAG.getCommutedVectorShuffle(*SVOp); // Check for non-undef masks pointing at an undef vector and make the masks // undef as well. This makes it easier to match the shuffle based solely on // the mask. if (V2IsUndef) for (int M : Mask) if (M >= NumElements) { SmallVector NewMask(Mask.begin(), Mask.end()); for (int &M : NewMask) if (M >= NumElements) M = -1; return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); } // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); if (Zeroable.all()) return getZeroVector(VT, Subtarget, DAG, dl); // Try to collapse shuffles into using a vector type with fewer elements but // wider element types. We cap this to not form integers or floating point // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector WidenedMask; if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && canWidenShuffleElements(Mask, WidenedMask)) { MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); // Make sure that the new vector type is legal. For example, v2f64 isn't // legal on SSE1. if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { V1 = DAG.getBitcast(NewVT, V1); V2 = DAG.getBitcast(NewVT, V2); return DAG.getBitcast( VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask)); } } int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; for (int M : SVOp->getMask()) if (M < 0) ++NumUndefElements; else if (M < NumElements) ++NumV1Elements; else ++NumV2Elements; // Commute the shuffle as needed such that more elements come from V1 than // V2. This allows us to match the shuffle pattern strictly on how many // elements come from V1 without handling the symmetric cases. if (NumV2Elements > NumV1Elements) return DAG.getCommutedVectorShuffle(*SVOp); // When the number of V1 and V2 elements are the same, try to minimize the // number of uses of V2 in the low half of the vector. When that is tied, // ensure that the sum of indices for V1 is equal to or lower than the sum // indices for V2. When those are equal, try to ensure that the number of odd // indices for V1 is lower than the number of odd indices for V2. if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; for (int M : SVOp->getMask().slice(0, NumElements / 2)) if (M >= NumElements) ++LowV2Elements; else if (M >= 0) ++LowV1Elements; if (LowV2Elements > LowV1Elements) { return DAG.getCommutedVectorShuffle(*SVOp); } else if (LowV2Elements == LowV1Elements) { int SumV1Indices = 0, SumV2Indices = 0; for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) if (SVOp->getMask()[i] >= NumElements) SumV2Indices += i; else if (SVOp->getMask()[i] >= 0) SumV1Indices += i; if (SumV2Indices < SumV1Indices) { return DAG.getCommutedVectorShuffle(*SVOp); } else if (SumV2Indices == SumV1Indices) { int NumV1OddIndices = 0, NumV2OddIndices = 0; for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) if (SVOp->getMask()[i] >= NumElements) NumV2OddIndices += i % 2; else if (SVOp->getMask()[i] >= 0) NumV1OddIndices += i % 2; if (NumV2OddIndices < NumV1OddIndices) return DAG.getCommutedVectorShuffle(*SVOp); } } } // For each vector width, delegate to a specialized lowering routine. if (VT.is128BitVector()) return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); if (VT.is256BitVector()) return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); if (VT.is512BitVector()) return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); if (Is1BitVector) return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } // This function assumes its argument is a BUILD_VECTOR of constants or // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is // true. static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, unsigned &MaskValue) { MaskValue = 0; unsigned NumElems = BuildVector->getNumOperands(); // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. // We don't handle the >2 lanes case right now. unsigned NumLanes = (NumElems - 1) / 8 + 1; if (NumLanes > 2) return false; unsigned NumElemsInLane = NumElems / NumLanes; // Blend for v16i16 should be symmetric for the both lanes. for (unsigned i = 0; i < NumElemsInLane; ++i) { SDValue EltCond = BuildVector->getOperand(i); SDValue SndLaneEltCond = (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond; int Lane1Cond = -1, Lane2Cond = -1; if (isa(EltCond)) Lane1Cond = !isNullConstant(EltCond); if (isa(SndLaneEltCond)) Lane2Cond = !isNullConstant(SndLaneEltCond); unsigned LaneMask = 0; if (Lane1Cond == Lane2Cond || Lane2Cond < 0) // Lane1Cond != 0, means we want the first argument. // Lane1Cond == 0, means we want the second argument. // The encoding of this argument is 0 for the first argument, 1 // for the second. Therefore, invert the condition. LaneMask = !Lane1Cond << i; else if (Lane1Cond < 0) LaneMask = !Lane2Cond << i; else return false; MaskValue |= LaneMask; if (NumLanes == 2) MaskValue |= LaneMask << NumElemsInLane; } return true; } /// \brief Try to lower a VSELECT instruction to a vector shuffle. static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); auto *CondBV = cast(Cond); // Only non-legal VSELECTs reach this lowering, convert those into generic // shuffles and re-use the shuffle lowering path for blends. SmallVector Mask; for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) { SDValue CondElt = CondBV->getOperand(i); Mask.push_back( isa(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0) : -1); } return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask); } SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { // A vselect where all conditions and data are constants can be optimized into // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) && ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) && ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) return BlendOp; // Variable blends are only legal from SSE4.1 onward. if (!Subtarget->hasSSE41()) return SDValue(); // Only some types will be legal on some subtargets. If we can emit a legal // VSELECT-matching blend, return Op, and but if we need to expand, return // a null value. switch (Op.getSimpleValueType().SimpleTy) { default: // Most of the vector types have blends past SSE4.1. return Op; case MVT::v32i8: // The byte blends for AVX vectors were introduced only in AVX2. if (Subtarget->hasAVX2()) return Op; return SDValue(); case MVT::v8i16: case MVT::v16i16: // AVX-512 BWI and VLX features support VSELECT with i16 elements. if (Subtarget->hasBWI() && Subtarget->hasVLX()) return Op; // FIXME: We should custom lower this by fixing the condition and using i8 // blends. return SDValue(); } } static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } if (VT.getSizeInBits() == 16) { // If Idx is 0, it's cheaper to do a move instead of a pextrw. if (isNullConstant(Op.getOperand(1))) return DAG.getNode( ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), Op.getOperand(1))); SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } if (VT == MVT::f32) { // EXTRACTPS outputs to a GPR32 register which will require a movd to copy // the result back to FR32 register. It's only worth matching if the // result has a single use which is a store or a bitcast to i32. And in // the case of a store, it's not worth it if the index is a constant 0, // because a MOVSSmr can be used instead, which is smaller and faster. if (!Op.hasOneUse()) return SDValue(); SDNode *User = *Op.getNode()->use_begin(); if ((User->getOpcode() != ISD::STORE || isNullConstant(Op.getOperand(1))) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), Op.getOperand(1)); return DAG.getBitcast(MVT::f32, Extract); } if (VT == MVT::i32 || VT == MVT::i64) { // ExtractPS/pextrq works with constant index. if (isa(Op.getOperand(1))) return Op; } return SDValue(); } /// Extract one bit from mask vector, like v16i1 or v8i1. /// AVX-512 feature. SDValue X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const { SDValue Vec = Op.getOperand(0); SDLoc dl(Vec); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); MVT EltVT = Op.getSimpleValueType(); assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) && "Unexpected vector type in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, // extend vector to VR512 if (!isa(Idx)) { MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtVT.getVectorElementType(), Ext, Idx); return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); } unsigned IdxVal = cast(Idx)->getZExtValue(); const TargetRegisterClass* rc = getRegClassFor(VecVT); if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8)) rc = getRegClassFor(MVT::v16i1); unsigned MaxSift = rc->getSize()*8 - 1; Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, DAG.getConstant(MaxSift, dl, MVT::i8)); return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, DAG.getIntPtrConstant(0, dl)); } SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); if (Op.getSimpleValueType() == MVT::i1) return ExtractBitFromMaskVector(Op, DAG); if (!isa(Idx)) { if (VecVT.is512BitVector() || (VecVT.is256BitVector() && Subtarget->hasInt256() && VecVT.getVectorElementType().getSizeInBits() == 32)) { MVT MaskEltVT = MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits()); MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() / MaskEltVT.getSizeInBits()); Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, getZeroVector(MaskVT, Subtarget, DAG, dl), Idx, DAG.getConstant(0, dl, PtrVT)); SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm, DAG.getConstant(0, dl, PtrVT)); } return SDValue(); } // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. if (VecVT.is256BitVector() || VecVT.is512BitVector()) { unsigned IdxVal = cast(Idx)->getZExtValue(); // Get the 128-bit vector. Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); MVT EltVT = VecVT.getVectorElementType(); unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2 // this can be done with a mask. IdxVal &= ElemsPerChunk - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getConstant(IdxVal, dl, MVT::i32)); } assert(VecVT.is128BitVector() && "Unexpected vector length"); if (Subtarget->hasSSE41()) if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; MVT VT = Op.getSimpleValueType(); // TODO: handle v16i8. if (VT.getSizeInBits() == 16) { SDValue Vec = Op.getOperand(0); if (isNullConstant(Op.getOperand(1))) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Op.getOperand(1))); // Transform it so it match pextrw which produces a 32-bit result. MVT EltVT = MVT::i32; SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } if (VT.getSizeInBits() == 32) { unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); if (Idx == 0) return Op; // SHUFPS the element to the lowest double word, then movss. int Mask[4] = { static_cast(Idx), -1, -1, -1 }; MVT VVT = Op.getOperand(0).getSimpleValueType(); SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } if (VT.getSizeInBits() == 64) { // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. if (isNullConstant(Op.getOperand(1))) return Op; // UNPCKHPD the element to the lowest double word, then movsd. // Note if the lower 64 bits of the result of the UNPCKHPD is then stored // to a f64mem, the whole operation is folded into a single MOVHPDmr. int Mask[2] = { 1, -1 }; MVT VVT = Op.getOperand(0).getSimpleValueType(); SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } return SDValue(); } /// Insert one bit to mask vector, like v16i1 or v8i1. /// AVX-512 feature. SDValue X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue Elt = Op.getOperand(1); SDValue Idx = Op.getOperand(2); MVT VecVT = Vec.getSimpleValueType(); if (!isa(Idx)) { // Non constant index. Extend source and destination, // insert element and then truncate the result. MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32); SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); } unsigned IdxVal = cast(Idx)->getZExtValue(); SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); if (IdxVal) EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); if (Vec.getOpcode() == ISD::UNDEF) return EltInVec; return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); if (EltVT == MVT::i1) return InsertBitToMaskVector(Op, DAG); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); if (!isa(N2)) return SDValue(); auto *N2C = cast(N2); unsigned IdxVal = N2C->getZExtValue(); // If the vector is wider than 128 bits, extract the 128-bit subvector, insert // into that, and then insert the subvector back into the result. if (VT.is256BitVector() || VT.is512BitVector()) { // With a 256-bit vector, we can insert into the zero element efficiently // using a blend if we have AVX or AVX2 and the right data type. if (VT.is256BitVector() && IdxVal == 0) { // TODO: It is worthwhile to cast integer to floating point and back // and incur a domain crossing penalty if that's what we'll end up // doing anyway after extracting to a 128-bit vector. if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || (Subtarget->hasAVX2() && EltVT == MVT::i32)) { SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); N2 = DAG.getIntPtrConstant(1, dl); return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); } } // Get the desired 128-bit vector chunk. SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired chunk. unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); assert(isPowerOf2_32(NumEltsIn128)); // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getConstant(IdxIn128, dl, MVT::i32)); // Insert the changed part back into the bigger vector return Insert128BitVector(N0, V, IdxVal, DAG, dl); } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); if (Subtarget->hasSSE41()) { if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) { unsigned Opc; if (VT == MVT::v8i16) { Opc = X86ISD::PINSRW; } else { assert(VT == MVT::v16i8); Opc = X86ISD::PINSRB; } // Transform it so it match pinsr{b,w} which expects a GR32 as its second // argument. if (N1.getValueType() != MVT::i32) N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(Opc, dl, VT, N0, N1, N2); } if (EltVT == MVT::f32) { // Bits [7:6] of the constant are the source select. This will always be // zero here. The DAG Combiner may combine an extract_elt index into // these bits. For example (insert (extract, 3), 2) could be matched by // putting the '3' into bits [7:6] of X86ISD::INSERTPS. // Bits [5:4] of the constant are the destination select. This is the // value of the incoming immediate. // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather // than an insertps. Blends are simpler operations in hardware and so // will always have equal or better performance than insertps. // But if optimizing for size and there's a load folding opportunity, // generate insertps because blendps does not have a 32-bit memory // operand form. N2 = DAG.getIntPtrConstant(1, dl); N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); } N2 = DAG.getIntPtrConstant(IdxVal << 4, dl); // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); } if (EltVT == MVT::i32 || EltVT == MVT::i64) { // PINSR* works with constant index. return Op; } } if (EltVT == MVT::i8) return SDValue(); if (EltVT.getSizeInBits() == 16) { // Transform it so it match pinsrw which expects a 16-bit value in a GR32 // as its second argument. if (N1.getValueType() != MVT::i32) N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); } return SDValue(); } static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT OpVT = Op.getSimpleValueType(); // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. unsigned SizeFactor = OpVT.getSizeInBits()/128; MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), OpVT.getVectorNumElements() / SizeFactor); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); // Insert the 128-bit vector. return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } if (OpVT == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); assert(OpVT.is128BitVector() && "Expected an SSE type!"); return DAG.getBitcast( OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); } // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in // a simple subregister reference or explicit instructions to grab // upper bits of a vector. static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); SDValue In = Op.getOperand(0); SDValue Idx = Op.getOperand(1); unsigned IdxVal = cast(Idx)->getZExtValue(); MVT ResVT = Op.getSimpleValueType(); MVT InVT = In.getSimpleValueType(); if (Subtarget->hasFp256()) { if (ResVT.is128BitVector() && (InVT.is256BitVector() || InVT.is512BitVector()) && isa(Idx)) { return Extract128BitVector(In, IdxVal, DAG, dl); } if (ResVT.is256BitVector() && InVT.is512BitVector() && isa(Idx)) { return Extract256BitVector(In, IdxVal, DAG, dl); } } return SDValue(); } // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a // simple superregister reference or explicit instructions to insert // the upper bits of a vector. static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { if (!Subtarget->hasAVX()) return SDValue(); SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue SubVec = Op.getOperand(1); SDValue Idx = Op.getOperand(2); if (!isa(Idx)) return SDValue(); unsigned IdxVal = cast(Idx)->getZExtValue(); MVT OpVT = Op.getSimpleValueType(); MVT SubVecVT = SubVec.getSimpleValueType(); // Fold two 16-byte subvector loads into one 32-byte load: // (insert_subvector (insert_subvector undef, (load addr), 0), // (load addr + 16), Elts/2) // --> load32 addr if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && OpVT.is256BitVector() && SubVecVT.is128BitVector()) { auto *Idx2 = dyn_cast(Vec.getOperand(2)); if (Idx2 && Idx2->getZExtValue() == 0) { SDValue SubVec2 = Vec.getOperand(1); // If needed, look through a bitcast to get to the load. if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST) SubVec2 = SubVec2.getOperand(0); if (auto *FirstLd = dyn_cast(SubVec2)) { bool Fast; unsigned Alignment = FirstLd->getAlignment(); unsigned AS = FirstLd->getAddressSpace(); const X86TargetLowering *TLI = Subtarget->getTargetLowering(); if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), OpVT, AS, Alignment, &Fast) && Fast) { SDValue Ops[] = { SubVec2, SubVec }; if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) return Ld; } } } } if ((OpVT.is256BitVector() || OpVT.is512BitVector()) && SubVecVT.is128BitVector()) return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); if (OpVT.getVectorElementType() == MVT::i1) return Insert1BitVector(Op, DAG); return SDValue(); } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOV32ri. SDValue X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = 0; unsigned WrapperKind = X86ISD::Wrapper; CodeModel::Model M = DAG.getTarget().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) WrapperKind = X86ISD::WrapperRIP; else if (Subtarget->isPICStyleGOT()) OpFlag = X86II::MO_GOTOFF; else if (Subtarget->isPICStyleStubPIC()) OpFlag = X86II::MO_PIC_BASE_OFFSET; auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetConstantPool( CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag); SDLoc DL(CP); Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) { Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); } return Result; } SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { JumpTableSDNode *JT = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = 0; unsigned WrapperKind = X86ISD::Wrapper; CodeModel::Model M = DAG.getTarget().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) WrapperKind = X86ISD::WrapperRIP; else if (Subtarget->isPICStyleGOT()) OpFlag = X86II::MO_GOTOFF; else if (Subtarget->isPICStyleStubPIC()) OpFlag = X86II::MO_PIC_BASE_OFFSET; auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); SDLoc DL(JT); Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); return Result; } SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { const char *Sym = cast(Op)->getSymbol(); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = 0; unsigned WrapperKind = X86ISD::Wrapper; CodeModel::Model M = DAG.getTarget().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) { if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) OpFlag = X86II::MO_GOTPCREL; WrapperKind = X86ISD::WrapperRIP; } else if (Subtarget->isPICStyleGOT()) { OpFlag = X86II::MO_GOT; } else if (Subtarget->isPICStyleStubPIC()) { OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; } else if (Subtarget->isPICStyleStubNoDynamic()) { OpFlag = X86II::MO_DARWIN_NONLAZY; } auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag); SDLoc DL(Op); Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ && !Subtarget->is64Bit()) { Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); } // For symbols that require a load from a stub to get the address, emit the // load. if (isGlobalStubReference(OpFlag)) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); return Result; } SDValue X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // Create the TargetBlockAddressAddress node. unsigned char OpFlags = Subtarget->ClassifyBlockAddressReference(); CodeModel::Model M = DAG.getTarget().getCodeModel(); const BlockAddress *BA = cast(Op)->getBlockAddress(); int64_t Offset = cast(Op)->getOffset(); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); else Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } return Result; } SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, int64_t Offset, SelectionDAG &DAG) const { // Create the TargetGlobalAddress node, folding in the constant // offset if it is legal. unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()); CodeModel::Model M = DAG.getTarget().getCodeModel(); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; if (OpFlags == X86II::MO_NO_FLAG && X86::isOffsetSuitableForCodeModel(Offset, M)) { // A direct static reference to a global. Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); Offset = 0; } else { Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); } if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); else Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } // For globals that require a load from a stub to get the address, emit the // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. if (Offset != 0) Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, DAG.getConstant(Offset, dl, PtrVT)); return Result; } SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = cast(Op)->getGlobal(); int64_t Offset = cast(Op)->getOffset(); return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); } static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LocalDynamic = false) { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDLoc dl(GA); SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR : X86ISD::TLSADDR; if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } else { SDValue Ops[] = { Chain, TGA }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. MFI->setAdjustsStack(true); MFI->setHasCalls(true); SDValue Flag = Chain.getValue(1); return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { SDValue InFlag; SDLoc dl(GA); // ? function entry point might be better SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSGD); } static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool is64Bit) { SDLoc dl(GA); // Get the start address of the TLS block for this module. X86MachineFunctionInfo* MFI = DAG.getMachineFunction() .getInfo(); MFI->incNumLocalDynamicTLSAccesses(); SDValue Base; if (is64Bit) { Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSLD, /*LocalDynamic=*/true); } else { SDValue InFlag; SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSLDM, /*LocalDynamic=*/true); } // Note: the CleanupLocalDynamicTLSPass will remove redundant computations // of Base. // Build x@dtpoff. unsigned char OperandFlags = X86II::MO_DTPOFF; unsigned WrapperKind = X86ISD::Wrapper; SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); // Add x@dtpoff with the base. return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); } // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC) { SDLoc dl(GA); // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), is64Bit ? 257 : 256)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr), false, false, false, 0); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. One exception is // initialexec. unsigned WrapperKind = X86ISD::Wrapper; if (model == TLSModel::LocalExec) { OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; } else if (model == TLSModel::InitialExec) { if (is64Bit) { OperandFlags = X86II::MO_GOTTPOFF; WrapperKind = X86ISD::WrapperRIP; } else { OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; } } else { llvm_unreachable("Unexpected model"); } // emit "addl x@ntpoff,%eax" (local exec) // or "addl x@indntpoff,%eax" (initial exec) // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); if (model == TLSModel::InitialExec) { if (isPIC && !is64Bit) { Offset = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); } // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); } SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast(Op); // Cygwin uses emutls. // FIXME: It may be EmulatedTLS-generic also for X86-Android. if (Subtarget->isTargetWindowsCygwin()) return LowerToTLSEmulatedModel(GA, DAG); const GlobalValue *GV = GA->getGlobal(); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (Subtarget->isTargetELF()) { if (DAG.getTarget().Options.EmulatedTLS) return LowerToTLSEmulatedModel(GA, DAG); TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: if (Subtarget->is64Bit()) return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); case TLSModel::LocalDynamic: return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget->is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget->is64Bit(), DAG.getTarget().getRelocationModel() == Reloc::PIC_); } llvm_unreachable("Unknown TLS model."); } if (Subtarget->isTargetDarwin()) { // Darwin only has one model of TLS. Lower to that. unsigned char OpFlag = 0; unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? X86ISD::WrapperRIP : X86ISD::Wrapper; // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) && !Subtarget->is64Bit(); if (PIC32) OpFlag = X86II::MO_TLVP_PIC_BASE; else OpFlag = X86II::MO_TLVP; SDLoc DL(Op); SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag); SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC32, the address is actually $g + Offset. if (PIC32) Offset = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); // Lowering the machine isd will make sure everything is in the right // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), DAG.getIntPtrConstant(0, DL, true), SDValue(), DL); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setAdjustsStack(true); // And our return value (tls address) is in the standard call return value // location. unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); } if (Subtarget->isTargetKnownWindowsMSVC() || Subtarget->isTargetWindowsGNU()) { // Just use the implicit TLS architecture // Need to generate someting similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage // ; from TEB // mov ecx, dword [rel _tls_index]: Load index (from C runtime) // mov rcx, qword [rdx+rcx*8] // mov eax, .tls$:tlsvar // [rax+rcx] contains the address // Windows 64bit: gs:0x58 // Windows 32bit: fs:__tls_array SDLoc dl(GA); SDValue Chain = DAG.getEntryNode(); // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly // use its literal value of 0x2C. Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() ? Type::getInt8PtrTy(*DAG.getContext(), 256) : Type::getInt32PtrTy(*DAG.getContext(), 257)); SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58, dl) : (Subtarget->isTargetWindowsGNU() ? DAG.getIntPtrConstant(0x2C, dl) : DAG.getExternalSymbol("_tls_array", PtrVT)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false, false, false, 0); SDValue res; if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) { res = ThreadPointer; } else { // Load the _tls_index variable SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT); if (Subtarget->is64Bit()) IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX, MachinePointerInfo(), MVT::i32, false, false, false, 0); else IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false, false, false, 0); auto &DL = DAG.getDataLayout(); SDValue Scale = DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT); IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale); res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX); } res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false, false, 0); // Get the offset of start of .tls section SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), X86II::MO_SECREL); SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset); } llvm_unreachable("TLS not implemented for this target."); } /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values /// and take a 2 x i32 value to shift plus a shift amount. static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); MVT VT = Op.getSimpleValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away // during isel. SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits - 1, dl, MVT::i8)); SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, dl, MVT::i8)) : DAG.getConstant(0, dl, VT); SDValue Tmp2, Tmp3; if (Op.getOpcode() == ISD::SHL_PARTS) { Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); } else { Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); } // If the shift amount is larger or equal than the width of a part we can't // rely on the results of shld/shrd. Insert a test and select the appropriate // values for large shift amounts. SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits, dl, MVT::i8)); SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, AndNode, DAG.getConstant(0, dl, MVT::i8)); SDValue Hi, Lo; SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; if (Op.getOpcode() == ISD::SHL_PARTS) { Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } else { Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getUNDEF(SrcVT))); } if (SrcVT.getVectorElementType() == MVT::i1) { MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); } return SDValue(); } assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); // These are really Legal; return the operand so the caller accepts it as // Legal. if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) return Op; if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && Subtarget->is64Bit()) { return Op; } SDValue ValueToStore = Op.getOperand(0); if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget->is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Chain = DAG.getStore( DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, false, 0); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); } SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const { // Build the FILD SDLoc DL(Op); SDVTList Tys; bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); if (useSSE) Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); else Tys = DAG.getVTList(Op.getValueType(), MVT::Other); unsigned ByteSize = SrcVT.getSizeInBits()/8; FrameIndexSDNode *FI = dyn_cast(StackSlot); MachineMemOperand *MMO; if (FI) { int SSFI = FI->getIndex(); MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { MMO = cast(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); } SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, Tys, Ops, SrcVT, MMO); if (useSSE) { Chain = Result.getValue(1); SDValue InFlag = Result.getValue(2); // FIXME: Currently the FST is flagged to the FILD_FLAG. This // shouldn't be necessary except that RFP cannot be live across // multiple blocks. When stackifier is fixed, they can be uncoupled. MachineFunction &MF = DAG.getMachineFunction(); unsigned SSFISize = Op.getValueType().getSizeInBits()/8; int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); auto PtrVT = getPointerTy(MF.getDataLayout()); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Tys = DAG.getVTList(MVT::Other); SDValue Ops[] = { Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag }; MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, Ops, Op.getValueType(), MMO); Result = DAG.getLoad( Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, false, false, 0); } return Result; } // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const { // This algorithm is not obvious. Here it is what we're trying to output: /* movq %rax, %xmm0 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } #ifdef __SSE3__ haddpd %xmm0, %xmm0 #else pshufd $0x4e, %xmm0, %xmm1 addpd %xmm1, %xmm0 #endif */ SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); // Build some magic constants. static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; Constant *C0 = ConstantDataVector::get(*Context, CV0); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16); SmallVector CV1; CV1.push_back( ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, APInt(64, 0x4330000000000000ULL)))); CV1.push_back( ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); // Load the 64-bit value into an XMM register. SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 16); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; if (Subtarget->hasSSE3()) { // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub); SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, S2F, 0x4E, DAG); Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, DAG.getBitcast(MVT::v2f64, Shuffle), Sub); } return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, DAG.getIntPtrConstant(0, dl)); } // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // Load the 32-bit value into an XMM register. SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(0)); // Zero out the upper parts of the register. Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Load), DAG.getIntPtrConstant(0, dl)); // Or the load with the bias. SDValue Or = DAG.getNode( ISD::OR, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); // Subtract the bias. // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); // Handle final rounding. MVT DestVT = Op.getSimpleValueType(); if (DestVT.bitsLT(MVT::f64)) return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, DAG.getIntPtrConstant(0, dl)); if (DestVT.bitsGT(MVT::f64)) return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); // Handle final rounding. return Sub; } static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // The algorithm is the following: // #ifdef __SSE4_1__ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); // #else // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; // uint4 hi = (v >> 16) | (uint4) 0x53000000; // #endif // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // return (float4) lo + fhi; // We shouldn't use it when unsafe-fp-math is enabled though: we might later // reassociate the two FADDs, and if we do that, the algorithm fails // spectacularly (PR24512). // FIXME: If we ever have some kind of Machine FMF, this should be marked // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because // there's also the MachineCombiner reassociations happening on Machine IR. if (DAG.getTarget().Options.UnsafeFPMath) return SDValue(); SDLoc DL(Op); SDValue V = Op->getOperand(0); MVT VecIntVT = V.getSimpleValueType(); bool Is128 = VecIntVT == MVT::v4i32; MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; // If we convert to something else than the supported type, e.g., to v4f64, // abort early. if (VecFloatVT != Op->getSimpleValueType(0)) return SDValue(); unsigned NumElts = VecIntVT.getVectorNumElements(); assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && "Unsupported custom type"); assert(NumElts <= 8 && "The size of the constant array must be fixed"); // In the #idef/#else code, we have in common: // - The vector of constants: // -- 0x4b000000 // -- 0x53000000 // - A shift: // -- v >> 16 // Create the splat vector for 0x4b000000. SDValue CstLow = DAG.getConstant(0x4b000000, DL, MVT::i32); SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow, CstLow, CstLow, CstLow, CstLow}; SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, makeArrayRef(&CstLowArray[0], NumElts)); // Create the splat vector for 0x53000000. SDValue CstHigh = DAG.getConstant(0x53000000, DL, MVT::i32); SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh, CstHigh, CstHigh, CstHigh, CstHigh}; SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, makeArrayRef(&CstHighArray[0], NumElts)); // Create the right shift. SDValue CstShift = DAG.getConstant(16, DL, MVT::i32); SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift, CstShift, CstShift, CstShift, CstShift}; SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, makeArrayRef(&CstShiftArray[0], NumElts)); SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); SDValue Low, High; if (Subtarget.hasSSE41()) { MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); // Low will be bitcasted right away, so do not bother bitcasting back to its // original type. Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift); // High will be bitcasted right away, so do not bother bitcasting back to // its original type. High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); } else { SDValue CstMask = DAG.getConstant(0xffff, DL, MVT::i32); SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask, CstMask, CstMask, CstMask); // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); // uint4 hi = (v >> 16) | (uint4) 0x53000000; High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); } // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). SDValue CstFAdd = DAG.getConstantFP( APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, MVT::f32); SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd, CstFAdd, CstFAdd, CstFAdd, CstFAdd}; SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT, makeArrayRef(&CstFAddArray[0], NumElts)); // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); // TODO: Are there any fast-math-flags to propagate here? SDValue FHigh = DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); // return (float4) lo + fhi; SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); } SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); MVT SVT = N0.getSimpleValueType(); SDLoc dl(Op); switch (SVT.SimpleTy) { default: llvm_unreachable("Custom UINT_TO_FP is not supported!"); case MVT::v4i8: case MVT::v4i16: case MVT::v8i8: case MVT::v8i16: { MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); } case MVT::v4i32: case MVT::v8i32: return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); case MVT::v16i8: case MVT::v16i16: assert(Subtarget->hasAVX512()); return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); } } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (Op.getSimpleValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG); // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform // the optimization here. if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) { // Conversions from unsigned i32 to f32/f64 are legal, // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. return Op; } if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG); if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG); if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); if (SrcVT == MVT::i32) { SDValue WordOff = DAG.getConstant(4, dl, PtrVT); SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, WordOff); SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, MachinePointerInfo(), false, false, 0); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), OffsetSlot, MachinePointerInfo(), false, false, 0); SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); return Fild; } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); SDValue ValueToStore = Op.getOperand(0); if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget->is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo(), false, false, 0); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) int SSFI = cast(StackSlot)->getIndex(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MMO); APInt FF(32, 0x5F800000ULL); // Check whether the sign bit is set. SDValue SignSet = DAG.getSetCC( dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. SDValue FudgePtr = DAG.getConstantPool( ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0, dl); SDValue Four = DAG.getIntPtrConstant(4, dl); SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, Zero, Four); FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad( ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, false, false, false, 4); // Extend everything to 80 bits to force it to be done on x87. // TODO: Are there any fast-math-flags to propagate here? SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); } // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation // is legal, or has an fp128 or f16 source (which needs to be promoted to f32), // just return an pair. // Otherwise it is assumed to be a conversion from one of f32, f64 or f80 // to i16, i32 or i64, and we lower it to a legal sequence. // If lowered to the final integer result we return a pair. // Otherwise we lower it to a sequence ending with a FIST, return a // pair, and the caller is responsible for loading // the final integer result from StackSlot. std::pair X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { SDLoc DL(Op); EVT DstTy = Op.getValueType(); EVT TheVT = Op.getOperand(0).getValueType(); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { // f16 must be promoted before using the lowering in this routine. // fp128 does not use this lowering. return std::make_pair(SDValue(), SDValue()); } // If using FIST to compute an unsigned i64, we'll need some fixup // to handle values above the maximum signed i64. A FIST is always // used for the 32-bit subtarget, but also for f80 on a 64-bit target. bool UnsignedFixup = !IsSigned && DstTy == MVT::i64 && (!Subtarget->is64Bit() || !isScalarFPTypeInSSEReg(TheVT)); if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) { // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. // The low 32 bits of the fist result will have the correct uint32 result. assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && "Unknown FP_TO_INT to lower!"); // These are really Legal. if (DstTy == MVT::i32 && isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); if (Subtarget->is64Bit() && DstTy == MVT::i64 && isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); // We lower FP->int64 into FISTP64 followed by a load from a temporary // stack slot. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); unsigned Opc; switch (DstTy.getSimpleVT().SimpleTy) { default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; } SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. if (UnsignedFixup) { // // Conversion to unsigned i64 is implemented with a select, // depending on whether the source value fits in the range // of a signed i64. Let Thresh be the FP equivalent of // 0x8000000000000000ULL. // // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); // Fist-to-mem64 FistSrc // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent // to XOR'ing the high 32 bits with Adjust. // // Being a power of 2, Thresh is exactly representable in all FP formats. // For X87 we'd like to use the smallest FP type for this constant, but // for DAG type consistency we have to match the FP operand type. APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000)); LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; bool LosesInfo = false; if (TheVT == MVT::f64) // The rounding mode is irrelevant as the conversion should be exact. Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &LosesInfo); else if (TheVT == MVT::f80) Status = Thresh.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, &LosesInfo); assert(Status == APFloat::opOK && !LosesInfo && "FP conversion should have been exact"); SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); SDValue Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), Value, ThreshVal, ISD::SETLT); Adjust = DAG.getSelect(DL, MVT::i32, Cmp, DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(0x80000000, DL, MVT::i32)); SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), Value, ThreshVal, ISD::SETLT); Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); } // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it is on the callstack. if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, MachinePointerInfo::getFixedStack(MF, SSFI), false, false, 0); SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(TheVT) }; MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), MachineMemOperand::MOLoad, MemSize, MemSize); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); StackSlot = DAG.getFrameIndex(SSFI, PtrVT); } MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), MachineMemOperand::MOStore, MemSize, MemSize); if (UnsignedFixup) { // Insert the FIST, load its result as two i32's, // and XOR the high i32 with Adjust. SDValue FistOps[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), FistOps, DstTy, MMO); SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot, DAG.getConstant(4, DL, PtrVT)); SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo(), false, false, false, 0); High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); if (Subtarget->is64Bit()) { // Join High32 and Low32 into a 64-bit result. // (High32 << 32) | Low32 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, DAG.getConstant(32, DL, MVT::i8)); SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); return std::make_pair(Result, SDValue()); } SDValue ResultOps[] = { Low32, High32 }; SDValue pair = IsReplace ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) : DAG.getMergeValues(ResultOps, DL); return std::make_pair(pair, SDValue()); } else { // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO); return std::make_pair(FIST, StackSlot); } } static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In); // Optimize vectors in AVX mode: // // v8i16 -> v8i32 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. // Concat upper and lower parts. // // v4i32 -> v4i64 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. // Concat upper and lower parts. // if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) && ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) return SDValue(); if (Subtarget->hasInt256()) return DAG.getNode(X86ISD::VZEXT, dl, VT, In); SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); SDValue Undef = DAG.getUNDEF(InVT); bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements()/2); OpLo = DAG.getBitcast(HVT, OpLo); OpHi = DAG.getBitcast(HVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc DL(Op); unsigned int NumElts = VT.getVectorNumElements(); if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI()) return SDValue(); if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) return DAG.getNode(X86ISD::VZEXT, DL, VT, In); assert(InVT.getVectorElementType() == MVT::i1); MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32; SDValue One = DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT); SDValue Zero = DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT); SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero); if (VT.is512BitVector()) return V; return DAG.getNode(X86ISD::VTRUNC, DL, VT, V); } static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { if (Subtarget->hasFp256()) if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) return Res; return SDValue(); } static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG); if (Subtarget->hasFp256()) if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) return Res; assert(!VT.is256BitVector() || !SVT.is128BitVector() || VT.getVectorNumElements() != SVT.getVectorNumElements()); return SDValue(); } static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); assert(VT.getVectorElementType() == MVT::i1 && "Unexected vector type."); // Shift LSB to MSB and use VPMOVB2M - SKX. unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M ((InVT.is256BitVector() || InVT.is128BitVector()) && InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() && Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M // Shift packed bytes not supported natively, bitcast to dword MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, DAG.getBitcast(ExtVT, In), DAG.getConstant(ShiftInx, DL, ExtVT)); ShiftNode = DAG.getBitcast(InVT, ShiftNode); return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); } if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M ((InVT.is256BitVector() || InVT.is128BitVector()) && InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() && Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, DAG.getConstant(ShiftInx, DL, InVT)); return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); } // Shift LSB to MSB, extend if necessary and use TESTM. unsigned NumElts = InVT.getVectorNumElements(); if (InVT.getSizeInBits() < 512 && (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 || !Subtarget->hasVLX())) { assert((NumElts == 8 || NumElts == 16) && "Unexected vector type."); // TESTD/Q should be used (if BW supported we use CVT2MASK above), // so vector should be extended to packed dword/qword. MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; ShiftInx = InVT.getScalarSizeInBits() - 1; } SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, DAG.getConstant(ShiftInx, DL, InVT)); return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode); } SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); if (VT == MVT::i1) { assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) && "Invalid scalar TRUNCATE operation"); if (InVT.getSizeInBits() >= 32) return SDValue(); In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); return DAG.getNode(ISD::TRUNCATE, DL, VT, In); } assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); if (VT.getVectorElementType() == MVT::i1) return LowerTruncateVecI1(Op, DAG, Subtarget); // vpmovqb/w/d, vpmovdb/w, vpmovwb if (Subtarget->hasAVX512()) { // word to byte only under BWI if (InVT == MVT::v16i16 && !Subtarget->hasBWI()) // v16i16 -> v16i8 return DAG.getNode(X86ISD::VTRUNC, DL, VT, DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In)); return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); } if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget->hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; In = DAG.getBitcast(MVT::v8i32, In); In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, DAG.getIntPtrConstant(0, DL)); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(2, DL)); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); static const int ShufMask[] = {0, 2, 4, 6}; return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); } if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { // On AVX2, v8i32 -> v8i16 becomed PSHUFB. if (Subtarget->hasInt256()) { In = DAG.getBitcast(MVT::v32i8, In); SmallVector pshufbMask; for (unsigned i = 0; i < 2; ++i) { pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8)); pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8)); pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8)); pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8)); pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8)); pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8)); pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8)); pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8)); for (unsigned j = 0; j < 8; ++j) pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); } SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask); In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); In = DAG.getBitcast(MVT::v4i64, In); static const int ShufMask[] = {0, 2, -1, -1}; In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), &ShufMask[0]); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); return DAG.getBitcast(VT, In); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(4, DL)); OpLo = DAG.getBitcast(MVT::v16i8, OpLo); OpHi = DAG.getBitcast(MVT::v16i8, OpHi); // The PSHUFB mask: static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1}; SDValue Undef = DAG.getUNDEF(MVT::v16i8); OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); // The MOVLHPS Mask: static const int ShufMask2[] = {0, 1, 4, 5}; SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); return DAG.getBitcast(MVT::v8i16, res); } // Handle truncation of V256 to V128 using shuffles. if (!VT.is128BitVector() || !InVT.is256BitVector()) return SDValue(); assert(Subtarget->hasFp256() && "256-bit vector without AVX!"); unsigned NumElems = VT.getVectorNumElements(); MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); SmallVector MaskVec(NumElems * 2, -1); // Prepare truncation shuffle mask for (unsigned i = 0; i != NumElems; ++i) MaskVec[i] = i * 2; SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In), DAG.getUNDEF(NVT), &MaskVec[0]); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, DAG.getIntPtrConstant(0, DL)); } SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { assert(!Op.getSimpleValueType().isVector()); std::pair Vals = FP_TO_INTHelper(Op, DAG, /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. if (!FIST.getNode()) return Op; if (StackSlot.getNode()) // Load the result. return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); // The node is the result. return FIST; } SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { std::pair Vals = FP_TO_INTHelper(Op, DAG, /*IsSigned=*/ false, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. if (!FIST.getNode()) return Op; if (StackSlot.getNode()) // Load the result. return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); // The node is the result. return FIST; } static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); return DAG.getNode(X86ISD::VFPEXT, DL, VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT))); } /// The only differences between FABS and FNEG are the mask and the logic op. /// FNEG also has a folding opportunity for FNEG(FABS(x)). static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && "Wrong opcode for lowering FABS or FNEG."); bool IsFABS = (Op.getOpcode() == ISD::FABS); // If this is a FABS and it has an FNEG user, bail out to fold the combination // into an FNABS. We'll lower the FABS after that if it is still in use. if (IsFABS) for (SDNode *User : Op->uses()) if (User->getOpcode() == ISD::FNEG) return Op; SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); bool IsF128 = (VT == MVT::f128); // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. MVT LogicVT; MVT EltVT; unsigned NumElts; if (VT.isVector()) { LogicVT = VT; EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); } else if (IsF128) { // SSE instructions are used for optimized f128 logical operations. LogicVT = MVT::f128; EltVT = VT; NumElts = 1; } else { // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. // Using a 16-byte mask allows folding the load of the mask with // the logic op, so it can save (~4 bytes) on code size. LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; EltVT = VT; NumElts = (VT == MVT::f64) ? 2 : 4; } unsigned EltBits = EltVT.getSizeInBits(); LLVMContext *Context = DAG.getContext(); // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits); Constant *C = ConstantInt::get(*Context, MaskElt); C = ConstantVector::getSplat(NumElts, C); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast(CPIdx)->getAlignment(); SDValue Mask = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, Alignment); SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); unsigned LogicOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; if (VT.isVector() || IsF128) return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); // For the scalar case extend to a 128-bit vector, perform the logic op, // and extract the scalar result back out. Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); LLVMContext *Context = DAG.getContext(); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT SrcVT = Op1.getSimpleValueType(); bool IsF128 = (VT == MVT::f128); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); SrcVT = VT; } // And if it is bigger, shrink it first. if (SrcVT.bitsGT(VT)) { Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl)); SrcVT = VT; } // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) && "Unexpected type in LowerFCOPYSIGN"); const fltSemantics &Sem = VT == MVT::f64 ? APFloat::IEEEdouble : (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); const unsigned SizeInBits = VT.getSizeInBits(); SmallVector CV( VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4), ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); // First, clear all bits but the sign bit from the second operand (sign). CV[0] = ConstantFP::get(*Context, APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1))); Constant *C = ConstantVector::get(CV); auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16); // Perform all logic operations as 16-byte vectors because there are no // scalar FP logic instructions in SSE. This allows load folding of the // constants into the logic instructions. MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32); SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 16); if (!IsF128) Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); // Next, clear the sign bit from the first operand (magnitude). // If it's a constant, we can clear it here. if (ConstantFPSDNode *Op0CN = dyn_cast(Op0)) { APFloat APF = Op0CN->getValueAPF(); // If the magnitude is a positive zero, the sign bit alone is enough. if (APF.isPosZero()) return IsF128 ? SignBit : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, DAG.getIntPtrConstant(0, dl)); APF.clearSign(); CV[0] = ConstantFP::get(*Context, APF); } else { CV[0] = ConstantFP::get( *Context, APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1))); } C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, PtrVT, 16); SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 16); // If the magnitude operand wasn't a constant, we need to AND out the sign. if (!isa(Op0)) { if (!IsF128) Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); } // OR the magnitude value with the sign bit. Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); return IsF128 ? Val : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, DAG.getConstant(1, dl, VT)); return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, dl, VT)); } // Check whether an OR'd tree is PTEST-able. static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); if (!Subtarget->hasSSE41()) return SDValue(); if (!Op->hasOneUse()) return SDValue(); SDNode *N = Op.getNode(); SDLoc DL(N); SmallVector Opnds; DenseMap VecInMap; SmallVector VecIns; EVT VT = MVT::Other; // Recognize a special case where a vector is casted into wide integer to // test all 0s. Opnds.push_back(N->getOperand(0)); Opnds.push_back(N->getOperand(1)); for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { SmallVectorImpl::const_iterator I = Opnds.begin() + Slot; // BFS traverse all OR'd operands. if (I->getOpcode() == ISD::OR) { Opnds.push_back(I->getOperand(0)); Opnds.push_back(I->getOperand(1)); // Re-evaluate the number of nodes to be traversed. e += 2; // 2 more nodes (LHS and RHS) are pushed. continue; } // Quit if a non-EXTRACT_VECTOR_ELT if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // Quit if without a constant index. SDValue Idx = I->getOperand(1); if (!isa(Idx)) return SDValue(); SDValue ExtractedFromVec = I->getOperand(0); DenseMap::iterator M = VecInMap.find(ExtractedFromVec); if (M == VecInMap.end()) { VT = ExtractedFromVec.getValueType(); // Quit if not 128/256-bit vector. if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); // Quit if not the same type. if (VecInMap.begin() != VecInMap.end() && VT != VecInMap.begin()->first.getValueType()) return SDValue(); M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; VecIns.push_back(ExtractedFromVec); } M->second |= 1U << cast(Idx)->getZExtValue(); } assert((VT.is128BitVector() || VT.is256BitVector()) && "Not extracted from 128-/256-bit vector."); unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; for (DenseMap::const_iterator I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { // Quit if not all elements are used. if (I->second != FullMask) return SDValue(); } MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; // Cast all vectors into TestVT for PTEST. for (unsigned i = 0, e = VecIns.size(); i < e; ++i) VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); // If more than one full vectors are evaluated, OR them first before PTEST. for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { // Each iteration will OR 2 nodes and append the result until there is only // 1 node left, i.e. the final OR'd value of all vectors. SDValue LHS = VecIns[Slot]; SDValue RHS = VecIns[Slot + 1]; VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } /// \brief return true if \c Op has a use that doesn't just read flags. static bool hasNonFlagsUse(SDValue Op) { for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; ++UI) { SDNode *User = *UI; unsigned UOpNo = UI.getOperandNo(); if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { // Look pass truncate. UOpNo = User->use_begin().getOperandNo(); User = *User->use_begin(); } if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) return true; } return false; } /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::i1) { SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op); return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp, DAG.getConstant(0, dl, MVT::i8)); } // CF and OF aren't always set the way we want. Determine which // of these we need. bool NeedCF = false; bool NeedOF = false; switch (X86CC) { default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: NeedCF = true; break; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: case X86::COND_O: case X86::COND_NO: { // Check if we really need to set the // Overflow flag. If NoSignedWrap is present // that is not actually needed. switch (Op->getOpcode()) { case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::SHL: { const auto *BinNode = cast(Op.getNode()); if (BinNode->Flags.hasNoSignedWrap()) break; } default: NeedOF = true; break; } break; } } // See if we can use the EFLAGS value from the operand instead of // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. if (Op.getResNo() != 0 || NeedOF || NeedCF) { // Emit a CMP with 0, which is the TEST pattern. //if (Op.getValueType() == MVT::i1) // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op, // DAG.getConstant(0, MVT::i1)); return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); } unsigned Opcode = 0; unsigned NumOperands = 0; // Truncate operations may prevent the merge of the SETCC instruction // and the arithmetic instruction before it. Attempt to truncate the operands // of the arithmetic instruction and use a reduced bit-width instruction. bool NeedTruncation = false; SDValue ArithOp = Op; if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) { SDValue Arith = Op->getOperand(0); // Both the trunc and the arithmetic op need to have one user each. if (Arith->hasOneUse()) switch (Arith.getOpcode()) { default: break; case ISD::ADD: case ISD::SUB: case ISD::AND: case ISD::OR: case ISD::XOR: { NeedTruncation = true; ArithOp = Arith; } } } // NOTICE: In the code below we use ArithOp to hold the arithmetic operation // which may be the result of a CAST. We use the variable 'Op', which is the // non-casted variable when we check for possible users. switch (ArithOp.getOpcode()) { case ISD::ADD: // Due to an isel shortcoming, be conservative if this add is likely to be // selected as part of a load-modify-store instruction. When the root node // in a match is a store, isel doesn't know how to remap non-chain non-flag // uses of other nodes in the match, such as the ADD in this case. This // leads to the ADD being left around and reselected, with the result being // two adds in the output. Alas, even if none our users are stores, that // doesn't prove we're O.K. Ergo, if we have any parents that aren't // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require // climbing the DAG back to the root, and it doesn't seem to be worth the // effort. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC && UI->getOpcode() != ISD::STORE) goto default_case; if (ConstantSDNode *C = dyn_cast(ArithOp.getNode()->getOperand(1))) { // An add of one will be selected as an INC. if (C->isOne() && !Subtarget->slowIncDec()) { Opcode = X86ISD::INC; NumOperands = 1; break; } // An add of negative one (subtract of one) will be selected as a DEC. if (C->isAllOnesValue() && !Subtarget->slowIncDec()) { Opcode = X86ISD::DEC; NumOperands = 1; break; } } // Otherwise use a regular EFLAGS-setting add. Opcode = X86ISD::ADD; NumOperands = 2; break; case ISD::SHL: case ISD::SRL: // If we have a constant logical shift that's only used in a comparison // against zero turn it into an equivalent AND. This allows turning it into // a TEST instruction later. if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() && isa(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); unsigned ShAmt = Op->getConstantOperandVal(1); if (ShAmt >= BitWidth) // Avoid undefined shifts. break; APInt Mask = ArithOp.getOpcode() == ISD::SRL ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); if (!Mask.isSignedIntN(32)) // Avoid large immediates. break; SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), DAG.getConstant(Mask, dl, VT)); DAG.ReplaceAllUsesWith(Op, New); Op = New; } break; case ISD::AND: // If the primary and result isn't used, don't bother using X86ISD::AND, // because a TEST instruction will be better. if (!hasNonFlagsUse(Op)) break; // FALL THROUGH case ISD::SUB: case ISD::OR: case ISD::XOR: // Due to the ISEL shortcoming noted above, be conservative if this op is // likely to be selected as part of a load-modify-store instruction. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) if (UI->getOpcode() == ISD::STORE) goto default_case; // Otherwise use a regular EFLAGS-setting instruction. switch (ArithOp.getOpcode()) { default: llvm_unreachable("unexpected operator!"); case ISD::SUB: Opcode = X86ISD::SUB; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: { if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG); if (EFLAGS.getNode()) return EFLAGS; } Opcode = X86ISD::OR; break; } } NumOperands = 2; break; case X86ISD::ADD: case X86ISD::SUB: case X86ISD::INC: case X86ISD::DEC: case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: return SDValue(Op.getNode(), 1); default: default_case: break; } // If we found that truncation is beneficial, perform the truncation and // update 'Op'. if (NeedTruncation) { EVT VT = Op.getValueType(); SDValue WideVal = Op->getOperand(0); EVT WideVT = WideVal.getValueType(); unsigned ConvertedOp = 0; // Use a target machine opcode to prevent further DAGCombine // optimizations that may separate the arithmetic operations // from the setcc node. switch (WideVal.getOpcode()) { default: break; case ISD::ADD: ConvertedOp = X86ISD::ADD; break; case ISD::SUB: ConvertedOp = X86ISD::SUB; break; case ISD::AND: ConvertedOp = X86ISD::AND; break; case ISD::OR: ConvertedOp = X86ISD::OR; break; case ISD::XOR: ConvertedOp = X86ISD::XOR; break; } if (ConvertedOp) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); } } } if (Opcode == 0) // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SmallVector Ops(Op->op_begin(), Op->op_begin() + NumOperands); SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesWith(Op, New); return SDValue(New.getNode(), 1); } /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { if (isNullConstant(Op1)) return EmitTest(Op0, X86CC, dl, DAG); assert(!(isa(Op1) && Op0.getValueType() == MVT::i1) && "Unexpected comparison operation for MVT::i1 operands"); if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { // Do the comparison at i32 if it's smaller, besides the Atom case. // This avoids subregister aliasing issues. Keep the smaller reference // if we're optimizing for size, however, as that'll allow better folding // of memory operations. if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && !DAG.getMachineFunction().getFunction()->optForMinSize() && !Subtarget->isAtom()) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); } // Use SUB instead of CMP to enable CSE between SUB and CMP. SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); return SDValue(Sub.getNode(), 1); } return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); } /// Convert a comparison if required by the subtarget. SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const { // If the subtarget does not support the FUCOMI instruction, floating-point // comparisons have to be converted. if (Subtarget->hasCMov() || Cmp.getOpcode() != X86ISD::CMP || !Cmp.getOperand(0).getValueType().isFloatingPoint() || !Cmp.getOperand(1).getValueType().isFloatingPoint()) return Cmp; // The instruction selector will select an FUCOM instruction instead of // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence // build an SDNode sequence that transfers the result from FPSW into EFLAGS: // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) SDLoc dl(Cmp); SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); // Some 64-bit targets lack SAHF support, but they do support FCOMI. assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, DAGCombinerInfo &DCI, unsigned &RefinementSteps, bool &UseOneConstNR) const { EVT VT = Op.getValueType(); const char *RecipOp; // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 // instructions: convert to single, rsqrtss, convert back to double, refine // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA // along with FMA, this could be a throughput win. if (VT == MVT::f32 && Subtarget->hasSSE1()) RecipOp = "sqrtf"; else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) || (VT == MVT::v8f32 && Subtarget->hasAVX())) RecipOp = "vec-sqrtf"; else return SDValue(); TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; if (!Recips.isEnabled(RecipOp)) return SDValue(); RefinementSteps = Recips.getRefinementSteps(RecipOp); UseOneConstNR = false; return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getRecipEstimate(SDValue Op, DAGCombinerInfo &DCI, unsigned &RefinementSteps) const { EVT VT = Op.getValueType(); const char *RecipOp; // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision // reciprocal estimate with refinement on x86 prior to FMA requires // 15 instructions: convert to single, rcpss, convert back to double, refine // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA // along with FMA, this could be a throughput win. if (VT == MVT::f32 && Subtarget->hasSSE1()) RecipOp = "divf"; else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) || (VT == MVT::v8f32 && Subtarget->hasAVX())) RecipOp = "vec-divf"; else return SDValue(); TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; if (!Recips.isEnabled(RecipOp)) return SDValue(); RefinementSteps = Recips.getRefinementSteps(RecipOp); return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); } /// If we have at least two divisions that use the same divisor, convert to /// multplication by a reciprocal. This may need to be adjusted for a given /// CPU if a division's cost is not at least twice the cost of a multiplication. /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the /// original divisions. unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node /// if it's possible. SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, SDLoc dl, SelectionDAG &DAG) const { SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); if (Op0.getOpcode() == ISD::TRUNCATE) Op0 = Op0.getOperand(0); if (Op1.getOpcode() == ISD::TRUNCATE) Op1 = Op1.getOperand(0); SDValue LHS, RHS; if (Op1.getOpcode() == ISD::SHL) std::swap(Op0, Op1); if (Op0.getOpcode() == ISD::SHL) { if (isOneConstant(Op0.getOperand(0))) { // If we looked past a truncate, check that it's only truncating away // known zeros. unsigned BitWidth = Op0.getValueSizeInBits(); unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { APInt Zeros, Ones; DAG.computeKnownBits(Op0, Zeros, Ones); if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) return SDValue(); } LHS = Op1; RHS = Op0.getOperand(1); } } else if (Op1.getOpcode() == ISD::Constant) { ConstantSDNode *AndRHS = cast(Op1); uint64_t AndRHSVal = AndRHS->getZExtValue(); SDValue AndLHS = Op0; if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { LHS = AndLHS.getOperand(0); RHS = AndLHS.getOperand(1); } // Use BT if the immediate can't be encoded in a TEST instruction. if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { LHS = AndLHS; RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType()); } } if (LHS.getNode()) { // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT // instruction. Since the shift amount is in-range-or-undefined, we know // that doing a bittest on the i32 value is ok. We extend to i32 because // the encoding for the i16 version is larger than the i32 version. // Also promote i16 to i32 for performance / code size reason. if (LHS.getValueType() == MVT::i8 || LHS.getValueType() == MVT::i16) LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); // If the operand types disagree, extend the shift amount to match. Since // BT ignores high bits (like shifts) we can use anyextend. if (LHS.getValueType() != RHS.getValueType()) RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(Cond, dl, MVT::i8), BT); } return SDValue(); } /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point /// mask CMPs. static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1) { unsigned SSECC; bool Swap = false; // SSE Condition code mapping: // 0 - EQ // 1 - LT // 2 - LE // 3 - UNORD // 4 - NEQ // 5 - NLT // 6 - NLE // 7 - ORD switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETOEQ: case ISD::SETEQ: SSECC = 0; break; case ISD::SETOGT: case ISD::SETGT: Swap = true; // Fallthrough case ISD::SETLT: case ISD::SETOLT: SSECC = 1; break; case ISD::SETOGE: case ISD::SETGE: Swap = true; // Fallthrough case ISD::SETLE: case ISD::SETOLE: SSECC = 2; break; case ISD::SETUO: SSECC = 3; break; case ISD::SETUNE: case ISD::SETNE: SSECC = 4; break; case ISD::SETULE: Swap = true; // Fallthrough case ISD::SETUGE: SSECC = 5; break; case ISD::SETULT: Swap = true; // Fallthrough case ISD::SETUGT: SSECC = 6; break; case ISD::SETO: SSECC = 7; break; case ISD::SETUEQ: case ISD::SETONE: SSECC = 8; break; } if (Swap) std::swap(Op0, Op1); return SSECC; } // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 // ones, and then concatenate the result back. static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); SDValue CC = Op.getOperand(2); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); // Issue the operation on the smaller types and concatenate the result back MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 && "Unexpected type for boolean compare operation"); ISD::CondCode SetCCOpcode = cast(CC)->get(); SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0, DAG.getConstant(-1, dl, VT)); SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1, DAG.getConstant(-1, dl, VT)); switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETEQ: // (x == y) -> ~(x ^ y) return DAG.getNode(ISD::XOR, dl, VT, DAG.getNode(ISD::XOR, dl, VT, Op0, Op1), DAG.getConstant(-1, dl, VT)); case ISD::SETNE: // (x != y) -> (x ^ y) return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1); case ISD::SETUGT: case ISD::SETGT: // (x > y) -> (x & ~y) return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1); case ISD::SETULT: case ISD::SETLT: // (x < y) -> (~x & y) return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1); case ISD::SETULE: case ISD::SETLE: // (x <= y) -> (~x | y) return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1); case ISD::SETUGE: case ISD::SETGE: // (x >=y) -> (x | ~y) return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1); } } static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 && Op.getSimpleValueType().getVectorElementType() == MVT::i1 && "Cannot set masked compare for this operation"); ISD::CondCode SetCCOpcode = cast(CC)->get(); unsigned Opc = 0; bool Unsigned = false; bool Swap = false; unsigned SSECC; switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETNE: SSECC = 4; break; case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break; case ISD::SETUGT: SSECC = 6; Unsigned = true; break; case ISD::SETLT: Swap = true; //fall-through case ISD::SETGT: Opc = X86ISD::PCMPGTM; break; case ISD::SETULT: SSECC = 1; Unsigned = true; break; case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap case ISD::SETULE: Unsigned = true; //fall-through case ISD::SETLE: SSECC = 2; break; } if (Swap) std::swap(Op0, Op1); if (Opc) return DAG.getNode(Opc, dl, VT, Op0, Op1); Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM; return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, dl, MVT::i8)); } /// \brief Try to turn a VSETULT into a VSETULE by modifying its second /// operand \p Op1. If non-trivial (for example because it's not constant) /// return an empty value. static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) { BuildVectorSDNode *BV = dyn_cast(Op1.getNode()); if (!BV) return SDValue(); MVT VT = Op1.getSimpleValueType(); MVT EVT = VT.getVectorElementType(); unsigned n = VT.getVectorNumElements(); SmallVector ULTOp1; for (unsigned i = 0; i < n; ++i) { ConstantSDNode *Elt = dyn_cast(BV->getOperand(i)); if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT) return SDValue(); // Avoid underflow. APInt Val = Elt->getAPIntValue(); if (Val == 0) return SDValue(); ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT)); } return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1); } static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); ISD::CondCode SetCCOpcode = cast(CC)->get(); bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); SDLoc dl(Op); if (isFP) { #ifndef NDEBUG MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); unsigned Opc = X86ISD::CMPP; if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16); Opc = X86ISD::CMPM; } // In the two special cases we can't handle, emit two comparisons. if (SSECC == 8) { unsigned CC0, CC1; unsigned CombineOpc; if (SetCCOpcode == ISD::SETUEQ) { CC0 = 3; CC1 = 0; CombineOpc = ISD::OR; } else { assert(SetCCOpcode == ISD::SETONE); CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; } SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC0, dl, MVT::i8)); SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC1, dl, MVT::i8)); return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } // Handle all other FP comparisons here. return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, dl, MVT::i8)); } MVT VTOp0 = Op0.getSimpleValueType(); assert(VTOp0 == Op1.getSimpleValueType() && "Expected operands with same type!"); assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && "Invalid number of packed elements for source and destination!"); if (VT.is128BitVector() && VTOp0.is256BitVector()) { // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the // legalizer firstly checks if the first operand in input to the setcc has // a legal type. If so, then it promotes the return type to that same type. // Otherwise, the return type is promoted to the 'next legal type' which, // for a vector of MVT::i1 is always a 128-bit integer vector type. // // We reach this code only if the following two conditions are met: // 1. Both return type and operand type have been promoted to wider types // by the type legalizer. // 2. The original operand type has been promoted to a 256-bit vector. // // Note that condition 2. only applies for AVX targets. SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode); return DAG.getZExtOrTrunc(NewOp, dl, VT); } // The non-AVX512 code below works under the assumption that source and // destination types are the same. assert((Subtarget->hasAVX512() || (VT == VTOp0)) && "Value types for source and destination must be the same!"); // Break 256-bit integer vector compare into smaller ones. if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntVSETCC(Op, DAG); MVT OpVT = Op1.getSimpleValueType(); if (OpVT.getVectorElementType() == MVT::i1) return LowerBoolVSETCC_AVX512(Op, DAG); bool MaskResult = (VT.getVectorElementType() == MVT::i1); if (Subtarget->hasAVX512()) { if (Op1.getSimpleValueType().is512BitVector() || (Subtarget->hasBWI() && Subtarget->hasVLX()) || (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); // In AVX-512 architecture setcc returns mask with i1 elements, // But there is no compare instruction for i8 and i16 elements in KNL. // We are not talking about 512-bit operands in this case, these // types are illegal. if (MaskResult && (OpVT.getVectorElementType().getSizeInBits() < 32 && OpVT.getVectorElementType().getSizeInBits() >= 8)) return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); } // Lower using XOP integer comparisons. if ((VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) { // Translate compare code to XOP PCOM compare mode. unsigned CmpMode = 0; switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETULT: case ISD::SETLT: CmpMode = 0x00; break; case ISD::SETULE: case ISD::SETLE: CmpMode = 0x01; break; case ISD::SETUGT: case ISD::SETGT: CmpMode = 0x02; break; case ISD::SETUGE: case ISD::SETGE: CmpMode = 0x03; break; case ISD::SETEQ: CmpMode = 0x04; break; case ISD::SETNE: CmpMode = 0x05; break; } // Are we comparing unsigned or signed integers? unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) ? X86ISD::VPCOMU : X86ISD::VPCOM; return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CmpMode, dl, MVT::i8)); } // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. unsigned Opc; bool Swap = false, Invert = false, FlipSigns = false, MinMax = false; bool Subus = false; switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETNE: Invert = true; case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; case ISD::SETLT: Swap = true; case ISD::SETGT: Opc = X86ISD::PCMPGT; break; case ISD::SETGE: Swap = true; case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; case ISD::SETULT: Swap = true; case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; case ISD::SETUGE: Swap = true; case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; } // Special case: Use min/max operations for SETULE/SETUGE MVT VET = VT.getVectorElementType(); bool hasMinMax = (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) || (Subtarget->hasSSE2() && (VET == MVT::i8)); if (hasMinMax) { switch (SetCCOpcode) { default: break; case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break; case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break; } if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } } bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); if (!MinMax && hasSubus) { // As another special case, use PSUBUS[BW] when it's profitable. E.g. for // Op0 u<= Op1: // t = psubus Op0, Op1 // pcmpeq t, <0..0> switch (SetCCOpcode) { default: break; case ISD::SETULT: { // If the comparison is against a constant we can turn this into a // setule. With psubus, setule does not require a swap. This is // beneficial because the constant in the register is no longer // destructed as the destination so it can be hoisted out of a loop. // Only do this pre-AVX since vpcmp* is no longer destructive. if (Subtarget->hasAVX()) break; SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG); if (ULEOp1.getNode()) { Op1 = ULEOp1; Subus = true; Invert = false; Swap = false; } break; } // Psubus is better than flip-sign because it requires no inversion. case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break; case ISD::SETULE: Subus = true; Invert = false; Swap = false; break; } if (Subus) { Opc = X86ISD::SUBUS; FlipSigns = false; } } if (Swap) std::swap(Op0, Op1); // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). if (VT == MVT::v2i64) { if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) { assert(Subtarget->hasSSE2() && "Don't know how to lower!"); // First cast everything to the right type. Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op1 = DAG.getBitcast(MVT::v4i32, Op1); // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. The lower // compare is always unsigned. SDValue SB; if (FlipSigns) { SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32); } else { SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32); SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32); SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Sign, Zero, Sign, Zero); } Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); // Create masks for only the low parts/high parts of the 64 bit integers. static const int MaskHi[] = { 1, 1, 3, 3 }; static const int MaskLo[] = { 0, 0, 2, 2 }; SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); return DAG.getBitcast(VT, Result); } if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) { // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with // pcmpeqd + pshufd + pand. assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); // First cast everything to the right type. Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op1 = DAG.getBitcast(MVT::v4i32, Op1); // Do the compare. SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); // Make sure the lower and upper halves are both all-ones. static const int Mask[] = { 1, 0, 3, 2 }; SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); return DAG.getBitcast(VT, Result); } } // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. if (FlipSigns) { MVT EltVT = VT.getVectorElementType(); SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl, VT); Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); // If the logical-not of the result is required, perform that now. if (Invert) Result = DAG.getNOT(dl, Result, VT); if (MinMax) Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); if (Subus) Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, getZeroVector(VT, Subtarget, DAG, dl)); return Result; } SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) && "SetCC type must be 8-bit or 1-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); ISD::CondCode CC = cast(Op.getOperand(2))->get(); // Optimize to BT if possible. // Lower (X & (1 << N)) == 0 to BT(X, N). // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); return NewSetCC; } } // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. if ((isOneConstant(Op1) || isNullConstant(Op1)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // If the input is a setcc, then reuse the input setcc or use a new one with // the inverted condition. if (Op0.getOpcode() == X86ISD::SETCC) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); if (!Invert) return Op0; CCode = X86::GetOppositeBranchCondition(CCode); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(CCode, dl, MVT::i8), Op0.getOperand(1)); if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); return SetCC; } } if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC); } bool isFP = Op1.getSimpleValueType().isFloatingPoint(); unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG); if (X86CC == X86::COND_INVALID) return SDValue(); SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS); if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); return SetCC; } SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue Carry = Op.getOperand(2); SDValue Cond = Op.getOperand(3); SDLoc DL(Op); assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); X86::CondCode CC = TranslateIntegerX86CC(cast(Cond)->get()); assert(Carry.getOpcode() != ISD::CARRY_FALSE); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(), DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); } // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getNode()->getOpcode(); if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || Opc == X86ISD::SAHF) return true; if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC || Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL || Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND)) return true; if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) return true; return false; } static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) return false; SDValue VOp0 = V.getOperand(0); unsigned InBits = VOp0.getValueSizeInBits(); unsigned Bits = V.getValueSizeInBits(); return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); } SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { bool addTest = true; SDValue Cond = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); SDLoc DL(Op); MVT VT = Op1.getSimpleValueType(); SDValue CC; // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available or VBLENDV if AVX is available. // Otherwise FP cmovs get lowered into a less efficient branch sequence later. if (Cond.getOpcode() == ISD::SETCC && ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || (Subtarget->hasSSE1() && VT == MVT::f32)) && VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); int SSECC = translateX86FSETCC( cast(Cond.getOperand(2))->get(), CondOp0, CondOp1); if (SSECC != 8) { if (Subtarget->hasAVX512()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2); } SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); // If we have AVX, we can use a variable vector select (VBLENDV) instead // of 3 logic instructions for size savings and potentially speed. // Unfortunately, there is no scalar form of VBLENDV. // If either operand is a constant, don't try this. We can expect to // optimize away at least one of the logic instructions later in that // case, so that sequence would be faster than a variable blend. // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly // uses XMM0 as the selection register. That may need just as many // instructions as the AND/ANDN/OR sequence due to register moves, so // don't bother. if (Subtarget->hasAVX() && !isa(Op1) && !isa(Op2)) { // Convert to vectors, do a VSELECT, and convert back to scalar. // All of the conversions should be optimized away. MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; VCmp = DAG.getBitcast(VCmpVT, VCmp); SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel, DAG.getIntPtrConstant(0, DL)); } SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); } } if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { SDValue Op1Scalar; if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) Op1Scalar = Op1.getOperand(0); SDValue Op2Scalar; if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) Op2Scalar = Op2.getOperand(0); if (Op1Scalar.getNode() && Op2Scalar.getNode()) { SDValue newSelect = DAG.getNode(ISD::SELECT, DL, Op1Scalar.getValueType(), Cond, Op1Scalar, Op2Scalar); if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, newSelect); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, DAG.getIntPtrConstant(0, DL)); } } if (VT == MVT::v4i1 || VT == MVT::v2i1) { SDValue zeroConst = DAG.getIntPtrConstant(0, DL); Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, DAG.getUNDEF(MVT::v8i1), Op1, zeroConst); Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, DAG.getUNDEF(MVT::v8i1), Op2, zeroConst); SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1, Cond, Op1, Op2); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); } if (Cond.getOpcode() == ISD::SETCC) { SDValue NewCond = LowerSETCC(Cond, DAG); if (NewCond.getNode()) Cond = NewCond; } // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); unsigned CondCode =cast(Cond.getOperand(0))->getZExtValue(); if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb if (isNullConstant(Y) && (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, DAG.getConstant(0, DL, CmpOp0.getValueType()), CmpOp0); SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), SDValue(Neg.getNode(), 1)); return Res; } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); Cmp = ConvertCmpIfNecessary(Cmp, DAG); SDValue Res = // Res = 0 or -1. DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } } // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && isOneConstant(Cond.getOperand(1))) Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); MVT VT = Op.getSimpleValueType(); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT)) // FPStack? IllegalFPCMov = !hasFPCMov(cast(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || Opc == X86ISD::BT) { // FIXME Cond = Cmp; addTest = false; } } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && Cond.getOperand(0).getValueType() != MVT::i8)) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); unsigned X86Opcode; unsigned X86Cond; SDVTList VTs; switch (CondOpcode) { case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; default: llvm_unreachable("unexpected overflowing operator"); } if (CondOpcode == ISD::UMULO) VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), MVT::i32); else VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); if (CondOpcode == ISD::UMULO) Cond = X86Op.getValue(2); else Cond = X86Op.getValue(1); CC = DAG.getConstant(X86Cond, DL, MVT::i8); addTest = false; } if (addTest) { // Look past the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; } } } if (addTest) { CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); } // a < b ? -1 : 0 -> RES = ~setcc_carry // a < b ? 0 : -1 -> RES = setcc_carry // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 0 : -1 -> RES = ~setcc_carry if (Cond.getOpcode() == X86ISD::SUB) { Cond = ConvertCmpIfNecessary(Cond, DAG); unsigned CondCode = cast(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (isNullConstant(Op1) || isNullConstant(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cond); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; } } // X86 doesn't have an i8 cmov. If both operands are the result of a truncate // widen the cmov and push the truncate through. This avoids introducing a new // branch during isel and doesn't add any extensions. if (Op.getValueType() == MVT::i8 && Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); if (T1.getValueType() == T2.getValueType() && // Blacklist CopyFromReg to avoid partial register stalls. T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue); SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } } // X86ISD::CMOV means set the result (which is operand 1) to the RHS if // condition is true. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); SDValue Ops[] = { Op2, Op1, CC, Cond }; return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); } static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); MVT VTElt = VT.getVectorElementType(); MVT InVTElt = InVT.getVectorElementType(); SDLoc dl(Op); // SKX processor if ((InVTElt == MVT::i1) && (((Subtarget->hasBWI() && Subtarget->hasVLX() && VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || ((Subtarget->hasBWI() && VT.is512BitVector() && VTElt.getSizeInBits() <= 16)) || ((Subtarget->hasDQI() && Subtarget->hasVLX() && VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || ((Subtarget->hasDQI() && VT.is512BitVector() && VTElt.getSizeInBits() >= 32)))) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); unsigned int NumElts = VT.getVectorNumElements(); if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI()) return SDValue(); if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) { if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); return DAG.getNode(X86ISD::VSEXT, dl, VT, In); } assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32; SDValue NegOne = DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl, ExtVT); SDValue Zero = DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT); SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); if (VT.is512BitVector()) return V; return DAG.getNode(X86ISD::VTRUNC, dl, VT, V); } static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT VT = Op->getSimpleValueType(0); MVT InVT = In.getSimpleValueType(); assert(VT.getSizeInBits() == InVT.getSizeInBits()); MVT InSVT = InVT.getVectorElementType(); assert(VT.getVectorElementType().getSizeInBits() > InSVT.getSizeInBits()); if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) return SDValue(); if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) return SDValue(); SDLoc dl(Op); // SSE41 targets can use the pmovsx* instructions directly. if (Subtarget->hasSSE41()) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. SDValue Curr = In; MVT CurrVT = InVT; // As SRAI is only available on i16/i32 types, we expand only up to i32 // and handle i64 separately. while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) { Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); Curr = DAG.getBitcast(CurrVT, Curr); } SDValue SignExt = Curr; if (CurrVT != InVT) { unsigned SignExtShift = CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits(); SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, DAG.getConstant(SignExtShift, dl, MVT::i8)); } if (CurrVT == VT) return SignExt; if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) { SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, DAG.getConstant(31, dl, MVT::i8)); SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5}); return DAG.getBitcast(VT, Ext); } return SDValue(); } static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && (VT != MVT::v8i32 || InVT != MVT::v8i16) && (VT != MVT::v16i16 || InVT != MVT::v16i8)) return SDValue(); if (Subtarget->hasInt256()) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); // Optimize vectors in AVX mode // Sign extend v8i16 to v8i32 and // v4i32 to v4i64 // // Divide input vector into two parts // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 // concat the vectors to original VT unsigned NumElems = InVT.getVectorNumElements(); SDValue Undef = DAG.getUNDEF(InVT); SmallVector ShufMask1(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask1[i] = i; SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]); SmallVector ShufMask2(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask2[i] = i + NumElems/2; SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements()/2); OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } // Lower vector extended loads using a shuffle. If SSSE3 is not available we // may emit an illegal shuffle but the expansion is still better than scalar // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise // we'll emit a shuffle and a arithmetic shift. // FIXME: Is the expansion actually better than scalar code? It doesn't seem so. // TODO: It is possible to support ZExt by zeroing the undef values during // the shuffle phase or after the shuffle. static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT RegVT = Op.getSimpleValueType(); assert(RegVT.isVector() && "We only custom lower vector sext loads."); assert(RegVT.isInteger() && "We only custom lower integer vector sext loads."); // Nothing useful we can do without SSE2 shuffles. assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2."); LoadSDNode *Ld = cast(Op.getNode()); SDLoc dl(Ld); EVT MemVT = Ld->getMemoryVT(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned RegSz = RegVT.getSizeInBits(); ISD::LoadExtType Ext = Ld->getExtensionType(); assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) && "Only anyext and sext are currently implemented."); assert(MemVT != RegVT && "Cannot extend to the same type"); assert(MemVT.isVector() && "Must load a vector from memory"); unsigned NumElems = RegVT.getVectorNumElements(); unsigned MemSz = MemVT.getSizeInBits(); assert(RegSz > MemSz && "Register size must be greater than the mem size"); if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) { // The only way in which we have a legal 256-bit vector result but not the // integer 256-bit operations needed to directly lower a sextload is if we // have AVX1 but not AVX2. In that case, we can always emit a sextload to // a 128-bit vector and a normal sign_extend to 256-bits that should get // correctly legalized. We do this late to allow the canonical form of // sextload to persist throughout the rest of the DAG combiner -- it wants // to fold together any extensions it can, and so will fuse a sign_extend // of an sextload into a sextload targeting a wider value. SDValue Load; if (MemSz == 128) { // Just switch this to a normal load. assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " "it must be a legal 128-bit vector " "type!"); Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Ld->getAlignment()); } else { assert(MemSz < 128 && "Can't extend a type wider than 128 bits to a 256 bit vector!"); // Do an sext load to a 128-bit vector type. We want to use the same // number of elements, but elements half as wide. This will end up being // recursively lowered by this routine, but will succeed as we definitely // have all the necessary features if we're using AVX1. EVT HalfEltVT = EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); Load = DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), MemVT, Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Ld->getAlignment()); } // Replace chain users with the new chain. assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); // Finally, do a normal sign-extend to the desired register. return DAG.getSExtOrTrunc(Load, dl, RegVT); } // All sizes must be a power of two. assert(isPowerOf2_32(RegSz * MemSz * NumElems) && "Non-power-of-two elements are not custom lowered!"); // Attempt to load the original value using scalar loads. // Find the largest scalar type that divides the total loaded size. MVT SclrLoadTy = MVT::i8; for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { SclrLoadTy = Tp; } } // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && (64 <= MemSz)) SclrLoadTy = MVT::f64; // Calculate the number of scalar loads that we need to perform // in order to load our vector from memory. unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && "Can only lower sext loads with a single scalar load!"); unsigned loadRegZize = RegSz; if (Ext == ISD::SEXTLOAD && RegSz >= 256) loadRegZize = 128; // Represent our vector as a sequence of elements which are the // largest scalar that we can load. EVT LoadUnitVecVT = EVT::getVectorVT( *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits()); // Represent the data using the same element type that is stored in // memory. In practice, we ''widen'' MemVT. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), loadRegZize / MemVT.getScalarSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && "Invalid vector type"); // We can't shuffle using an illegal type. assert(TLI.isTypeLegal(WideVecVT) && "We only lower types that form legal widened vector types"); SmallVector Chains; SDValue Ptr = Ld->getBasePtr(); SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, TLI.getPointerTy(DAG.getDataLayout())); SDValue Res = DAG.getUNDEF(LoadUnitVecVT); for (unsigned i = 0; i < NumLoads; ++i) { // Perform a single load. SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Ld->getAlignment()); Chains.push_back(ScalarLoad.getValue(1)); // Create the first element type using SCALAR_TO_VECTOR in order to avoid // another round of DAGCombining. if (i == 0) Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); else Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, ScalarLoad, DAG.getIntPtrConstant(i, dl)); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); unsigned SizeRatio = RegSz / MemSz; if (Ext == ISD::SEXTLOAD) { // If we have SSE4.1, we can directly emit a VSEXT node. if (Subtarget->hasSSE41()) { SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Sext; } // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest // lanes. assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) && "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!"); SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Shuff; } // Redistribute the loaded elements into the different locations. SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i * SizeRatio] = i; SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); // Bitcast to the requested type. Shuff = DAG.getBitcast(RegVT, Shuff); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Shuff; } // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart // from the AND / OR. static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { Opc = Op.getOpcode(); if (Opc != ISD::OR && Opc != ISD::AND) return false; return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse() && Op.getOperand(1).getOpcode() == X86ISD::SETCC && Op.getOperand(1).hasOneUse()); } // isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and // 1 and that the SETCC node has a single use. static bool isXor1OfSetCC(SDValue Op) { if (Op.getOpcode() != ISD::XOR) return false; if (isOneConstant(Op.getOperand(1))) return Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse(); return false; } SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { bool addTest = true; SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); SDLoc dl(Op); SDValue CC; bool Inverted = false; if (Cond.getOpcode() == ISD::SETCC) { // Check for setcc([su]{add,sub,mul}o == 0). if (cast(Cond.getOperand(2))->get() == ISD::SETEQ && isNullConstant(Cond.getOperand(1)) && Cond.getOperand(0).getResNo() == 1 && (Cond.getOperand(0).getOpcode() == ISD::SADDO || Cond.getOperand(0).getOpcode() == ISD::UADDO || Cond.getOperand(0).getOpcode() == ISD::SSUBO || Cond.getOperand(0).getOpcode() == ISD::USUBO || Cond.getOperand(0).getOpcode() == ISD::SMULO || Cond.getOperand(0).getOpcode() == ISD::UMULO)) { Inverted = true; Cond = Cond.getOperand(0); } else { SDValue NewCond = LowerSETCC(Cond, DAG); if (NewCond.getNode()) Cond = NewCond; } } #if 0 // FIXME: LowerXALUO doesn't handle these!! else if (Cond.getOpcode() == X86ISD::ADD || Cond.getOpcode() == X86ISD::SUB || Cond.getOpcode() == X86ISD::SMUL || Cond.getOpcode() == X86ISD::UMUL) Cond = LowerXALUO(Cond, DAG); #endif // Look pass (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && isOneConstant(Cond.getOperand(1))) Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { Cond = Cmp; addTest = false; } else { switch (cast(CC)->getZExtValue()) { default: break; case X86::COND_O: case X86::COND_B: // These can only come from an arithmetic instruction with overflow, // e.g. SADDO, UADDO. Cond = Cond.getNode()->getOperand(1); addTest = false; break; } } } CondOpcode = Cond.getOpcode(); if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && Cond.getOperand(0).getValueType() != MVT::i8)) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); unsigned X86Opcode; unsigned X86Cond; SDVTList VTs; // Keep this in sync with LowerXALUO, otherwise we might create redundant // instructions that can't be removed afterwards (i.e. X86ISD::ADD and // X86ISD::INC). switch (CondOpcode) { case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; case ISD::SADDO: if (isOneConstant(RHS)) { X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; break; } X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; case ISD::SSUBO: if (isOneConstant(RHS)) { X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; break; } X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; default: llvm_unreachable("unexpected overflowing operator"); } if (Inverted) X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); if (CondOpcode == ISD::UMULO) VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), MVT::i32); else VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); if (CondOpcode == ISD::UMULO) Cond = X86Op.getValue(2); else Cond = X86Op.getValue(1); CC = DAG.getConstant(X86Cond, dl, MVT::i8); addTest = false; } else { unsigned CondOpc; if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { SDValue Cmp = Cond.getOperand(0).getOperand(1); if (CondOpc == ISD::OR) { // Also, recognize the pattern generated by an FCMP_UNE. We can emit // two branches instead of an explicit OR instruction with a // separate test. if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp)) { CC = Cond.getOperand(0).getOperand(0); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = Cond.getOperand(1).getOperand(0); Cond = Cmp; addTest = false; } } else { // ISD::AND // Also, recognize the pattern generated by an FCMP_OEQ. We can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp) && Op.getNode()->hasOneUse()) { X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); X86::CondCode CCode = (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); Cond = Cmp; addTest = false; } } } } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. // It should be transformed during dag combiner except when the condition // is set by a arithmetics with overflow node. X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; } else if (Cond.getOpcode() == ISD::SETCC && cast(Cond.getOperand(2))->get() == ISD::SETOEQ) { // For FCMP_OEQ, we can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Op.getNode()->hasOneUse()) { SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } } } else if (Cond.getOpcode() == ISD::SETCC && cast(Cond.getOperand(2))->get() == ISD::SETUNE) { // For FCMP_UNE, we can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Op.getNode()->hasOneUse()) { SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_UNE. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8); Cond = Cmp; addTest = false; Dest = FalseBB; } } } } if (addTest) { // Look pass the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; } } } if (addTest) { X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; CC = DAG.getConstant(X86Cond, dl, MVT::i8); Cond = EmitTest(Cond, X86Cond, dl, DAG); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cond); } // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. // Calls to _alloca are needed to probe the stack when allocating more than 4k // bytes in one go. Touching the stack at 4K increments is necessary to ensure // that the guard pages used by the OS virtual memory manager are allocated in // correct sequence. SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) || SplitStack; SDLoc dl(Op); // Get the inputs. SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); unsigned Align = cast(Op.getOperand(2))->getZExtValue(); EVT VT = Node->getValueType(0); // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); bool Is64Bit = Subtarget->is64Bit(); MVT SPTy = getPointerTy(DAG.getDataLayout()); SDValue Result; if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" " not tell us which reg is the stack pointer!"); EVT VT = Node->getValueType(0); SDValue Tmp3 = Node->getOperand(2); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast(Tmp3)->getZExtValue(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) Result = DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { // The 64 bit implementation of segmented stacks needs to clobber both r10 // r11. This makes it impossible to use it along with nested parameters. const Function *F = MF.getFunction(); for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) if (I->hasNestAttr()) report_fatal_error("Cannot use segmented stacks with functions that " "have nested arguments."); } const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); } else { SDValue Flag; const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX); Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); Flag = Chain.getValue(1); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); if (Align) { SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } Result = SP; } Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); SDValue Ops[2] = {Result, Chain}; return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const Value *SV = cast(Op.getOperand(2))->getValue(); SDLoc DL(Op); if (!Subtarget->is64Bit() || Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV), false, false, 0); } // __va_list_tag: // gp_offset (0 - 6 * 8) // fp_offset (48 - 48 + 8 * 16) // overflow_arg_area (point to parameters coming in memory). // reg_save_area SmallVector MemOps; SDValue FIN = Op.getOperand(1); // Store gp_offset SDValue Store = DAG.getStore(Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV), false, false, 0); MemOps.push_back(Store); // Store fp_offset FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); Store = DAG.getStore(Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV, 4), false, false, 0); MemOps.push_back(Store); // Store ptr to overflow_arg_area FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8), false, false, 0); MemOps.push_back(Store); // Store ptr to reg_save_area. FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( Subtarget->isTarget64BitLP64() ? 8 : 4, DL)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo( SV, Subtarget->isTarget64BitLP64() ? 16 : 12), false, false, 0); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); assert(Op.getNode()->getNumOperands() == 4); MachineFunction &MF = DAG.getMachineFunction(); if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) // The Win64 ABI uses char* instead of a structure. return DAG.expandVAArg(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); unsigned Align = Op.getConstantOperandVal(3); SDLoc dl(Op); EVT ArgVT = Op.getNode()->getValueType(0); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); uint8_t ArgMode; // Decide which area this value should be read from. // TODO: Implement the AMD64 ABI in its entirety. This simple // selection mechanism works only for the basic types. if (ArgVT == MVT::f80) { llvm_unreachable("va_arg for f80 not yet implemented"); } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { ArgMode = 2; // Argument passed in XMM register. Use fp_offset. } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. } else { llvm_unreachable("Unhandled argument type in LowerVAARG"); } if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. assert(!Subtarget->useSoftFloat() && !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget->hasSSE1()); } // Insert VAARG_64 node into the DAG // VAARG_64 returns two values: Variable Argument Address, Chain SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32), DAG.getConstant(ArgMode, dl, MVT::i8), DAG.getConstant(Align, dl, MVT::i32)}; SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV), /*Align=*/0, /*Volatile=*/false, /*ReadMem=*/true, /*WriteMem=*/true); Chain = VAARG.getValue(1); // Load the next argument and return it return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo(), false, false, false, 0); } static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, // where a va_list is still an i8*. assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); if (Subtarget->isCallingConvWin64( DAG.getMachineFunction().getFunction()->getCallingConv())) // Probably a Win64 va_copy. return DAG.expandVACopy(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue DstPtr = Op.getOperand(1); SDValue SrcPtr = Op.getOperand(2); const Value *DstSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); SDLoc DL(Op); return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false, false, false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } // getTargetVShiftByConstNode - Handle vector element shifts where the shift // amount is a constant. Takes immediate version of shift as input. static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG) { MVT ElementType = VT.getVectorElementType(); // Fold this packed shift into its first operand if ShiftAmt is 0. if (ShiftAmt == 0) return SrcOp; // Check for ShiftAmt >= element width if (ShiftAmt >= ElementType.getSizeInBits()) { if (Opc == X86ISD::VSRAI) ShiftAmt = ElementType.getSizeInBits() - 1; else return DAG.getConstant(0, dl, VT); } assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) && "Unknown target vector shift-by-constant node"); // Fold this packed vector shift into a build vector if SrcOp is a // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT. if (VT == SrcOp.getSimpleValueType() && ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { SmallVector Elts; unsigned NumElts = SrcOp->getNumOperands(); ConstantSDNode *ND; switch(Opc) { default: llvm_unreachable(nullptr); case X86ISD::VSHLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->getOpcode() == ISD::UNDEF) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->getOpcode() == ISD::UNDEF) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRAI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->getOpcode() == ISD::UNDEF) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); } break; } return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); } return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, dl, MVT::i8)); } // getTargetVShiftNode - Handle vector element shifts where the shift amount // may or may not be a constant. Takes immediate version of shift as input. static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, SDValue SrcOp, SDValue ShAmt, SelectionDAG &DAG) { MVT SVT = ShAmt.getSimpleValueType(); assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); // Catch shift-by-constant. if (ConstantSDNode *CShAmt = dyn_cast(ShAmt)) return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, CShAmt->getZExtValue(), DAG); // Change opcode to non-immediate version switch (Opc) { default: llvm_unreachable("Unknown target vector shift node"); case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; } const X86Subtarget &Subtarget = static_cast(DAG.getSubtarget()); if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { // Let the shuffle legalizer expand this shift amount node. SDValue Op0 = ShAmt.getOperand(0); Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG); } else { // Need to build a vector containing shift amount. // SSE/AVX packed shifts only use the lower 64-bit of the shift count. SmallVector ShOps; ShOps.push_back(ShAmt); if (SVT == MVT::i32) { ShOps.push_back(DAG.getConstant(0, dl, SVT)); ShOps.push_back(DAG.getUNDEF(SVT)); } ShOps.push_back(DAG.getUNDEF(SVT)); MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64; ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps); } // The return type has to be a 128-bit type with the same element // type as the input type. MVT EltVT = VT.getVectorElementType(); MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); ShAmt = DAG.getBitcast(ShVT, ShAmt); return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } /// \brief Return Mask with the necessary casting or extending /// for \p Mask according to \p MaskVT when lowering masking intrinsics static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { if (MaskVT.bitsGT(Mask.getSimpleValueType())) { // Mask should be extended Mask = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); } if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) { if (MaskVT == MVT::v64i1) { assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); // In case 32bit mode, bitcast i64 is illegal, extend/split it. SDValue Lo, Hi; Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, DAG.getConstant(0, dl, MVT::i32)); Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, DAG.getConstant(1, dl, MVT::i32)); Lo = DAG.getBitcast(MVT::v32i1, Lo); Hi = DAG.getBitcast(MVT::v32i1, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); } else { // MaskVT require < 64bit. Truncate mask (should succeed in any case), // and bitcast. MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); return DAG.getBitcast(MaskVT, DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); } } else { MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getBitcast(BitcastVT, Mask), DAG.getIntPtrConstant(0, dl)); } } /// \brief Return (and \p Op, \p Mask) for compare instructions or /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the /// necessary casting or extending for \p Mask when lowering masking intrinsics static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); unsigned OpcodeSelect = ISD::VSELECT; SDLoc dl(Op); if (isAllOnesConstant(Mask)) return Op; SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); switch (Op.getOpcode()) { default: break; case X86ISD::PCMPEQM: case X86ISD::PCMPGTM: case X86ISD::CMPM: case X86ISD::CMPMU: return DAG.getNode(ISD::AND, dl, VT, Op, VMask); case X86ISD::VFPCLASS: case X86ISD::VFPCLASSS: return DAG.getNode(ISD::OR, dl, VT, Op, VMask); case X86ISD::VTRUNC: case X86ISD::VTRUNCS: case X86ISD::VTRUNCUS: // We can't use ISD::VSELECT here because it is not always "Legal" // for the destination type. For example vpmovqb require only AVX512 // and vselect that can operate on byte element type require BWI OpcodeSelect = X86ISD::SELECT; break; } if (PreservedSrc.getOpcode() == ISD::UNDEF) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// \brief Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). /// The mask is coming as MVT::i8 and it should be truncated /// to MVT::i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using /// "X86select" instead of "vselect". We just can't create the "vselect" node /// for a scalar instruction. static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget *Subtarget, SelectionDAG &DAG) { if (isAllOnesConstant(Mask)) return Op; MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); // The mask should be of type MVT::i1 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); if (Op.getOpcode() == X86ISD::FSETCC) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); if (Op.getOpcode() == X86ISD::VFPCLASS || Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::OR, dl, VT, Op, IMask); if (PreservedSrc.getOpcode() == ISD::UNDEF) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); } static int getSEHRegistrationNodeSize(const Function *Fn) { if (!Fn->hasPersonalityFn()) report_fatal_error( "querying registration node size for function without personality"); // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See // WinEHStatePass for the full struct definition. switch (classifyEHPersonality(Fn->getPersonalityFn())) { case EHPersonality::MSVC_X86SEH: return 24; case EHPersonality::MSVC_CXX: return 16; default: break; } report_fatal_error( "can only recover FP for 32-bit MSVC EH personality functions"); } /// When the MSVC runtime transfers control to us, either to an outlined /// function or when returning to a parent frame after catching an exception, we /// recover the parent frame pointer by doing arithmetic on the incoming EBP. /// Here's the math: /// RegNodeBase = EntryEBP - RegNodeSize /// ParentFP = RegNodeBase - ParentFrameOffset /// Subtracting RegNodeSize takes us to the offset of the registration node, and /// subtracting the offset (negative on x86) takes us back to the parent FP. static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP) { MachineFunction &MF = DAG.getMachineFunction(); SDLoc dl; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); // It's possible that the parent function no longer has a personality function // if the exceptional code was optimized away, in which case we just return // the incoming EBP. if (!Fn->hasPersonalityFn()) return EntryEBP; // Get an MCSymbol that will ultimately resolve to the frame offset of the EH // registration, or the .set_setframe offset. MCSymbol *OffsetSym = MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( GlobalValue::getRealLinkageName(Fn->getName())); SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after // prologue to RBP in the parent function. const X86Subtarget &Subtarget = static_cast(DAG.getSubtarget()); if (Subtarget.is64Bit()) return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); int RegNodeSize = getSEHRegistrationNodeSize(Fn); // RegNodeBase = EntryEBP - RegNodeSize // ParentFP = RegNodeBase - ParentFrameOffset SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, DAG.getConstant(RegNodeSize, dl, PtrVT)); return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); } static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); if (IntrData) { switch(IntrData->Type) { case INTR_TYPE_1OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); case INTR_TYPE_2OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case INTR_TYPE_2OP_IMM8: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2))); case INTR_TYPE_3OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case INTR_TYPE_4OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); case INTR_TYPE_1OP_MASK_RM: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue RoundingMode; // We allways add rounding mode to the Node. // If the rounding mode is not specified, we add the // "current direction" mode. if (Op.getNumOperands() == 4) RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); else RoundingMode = Op.getOperand(4); unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) if (cast(RoundingMode)->getZExtValue() != X86::STATIC_ROUNDING::CUR_DIRECTION) return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, RoundingMode), Mask, PassThru, Subtarget, DAG); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, RoundingMode), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_1OP_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); // We add rounding mode to the Node when // - RM Opcode is specified and // - RM is not "current direction". unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); unsigned Round = cast(Rnd)->getZExtValue(); if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, Rnd), Mask, PassThru, Subtarget, DAG); } } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), Mask, passThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src0 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // There are 2 kinds of intrinsics in this group: // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands // (2) With rounding mode and sae - 7 operands. if (Op.getNumOperands() == 6) { SDValue Sae = Op.getOperand(5); unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0; return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Sae), Mask, Src0, Subtarget, DAG); } assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form"); SDValue RoundingMode = Op.getOperand(5); SDValue Sae = Op.getOperand(6); return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, RoundingMode, Sae), Mask, Src0, Subtarget, DAG); } case INTR_TYPE_2OP_MASK: case INTR_TYPE_2OP_IMM8_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK) Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(5); unsigned Round = cast(Rnd)->getZExtValue(); if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Rnd), Mask, PassThru, Subtarget, DAG); } } // TODO: Intrinsics should have fast-math-flags to propagate. return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // We specify 2 possible modes for intrinsics, with/without rounding // modes. // First, we check if the intrinsic have rounding mode (6 operands), // if not, we set rounding mode to "current". SDValue Rnd; if (Op.getNumOperands() == 6) Rnd = Op.getOperand(5); else Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Rnd), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Sae = Op.getOperand(6); return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3, Sae), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Imm = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); // We specify 2 possible modes for intrinsics, with/without rounding // modes. // First, we check if the intrinsic have rounding mode (7 operands), // if not, we set rounding mode to "current". SDValue Rnd; if (Op.getNumOperands() == 7) Rnd = Op.getOperand(6); else Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Imm, Rnd), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_IMM8_MASK: case INTR_TYPE_3OP_MASK: case INSERT_SUBVEC: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK) Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); else if (IntrData->Type == INSERT_SUBVEC) { // imm should be adapted to ISD::INSERT_SUBVECTOR behavior assert(isa(Src3) && "Expected a ConstantSDNode here!"); unsigned Imm = cast(Src3)->getZExtValue(); Imm *= Src2.getSimpleValueType().getVectorNumElements(); Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32); } // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(6); unsigned Round = cast(Rnd)->getZExtValue(); if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, Rnd), Mask, PassThru, Subtarget, DAG); } } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case VPERM_3OP_MASKZ: case VPERM_3OP_MASK:{ // Src2 is the PassThru SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); MVT VT = Op.getSimpleValueType(); SDValue PassThru = SDValue(); // set PassThru element if (IntrData->Type == VPERM_3OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); else PassThru = DAG.getBitcast(VT, Src2); // Swap Src1 and Src2 in the node creation return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src2, Src1, Src3), Mask, PassThru, Subtarget, DAG); } case FMA_OP_MASK3: case FMA_OP_MASKZ: case FMA_OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); MVT VT = Op.getSimpleValueType(); SDValue PassThru = SDValue(); // set PassThru element if (IntrData->Type == FMA_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); else if (IntrData->Type == FMA_OP_MASK3) PassThru = Src3; else PassThru = Src1; // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(5); if (cast(Rnd)->getZExtValue() != X86::STATIC_ROUNDING::CUR_DIRECTION) return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, Rnd), Mask, PassThru, Subtarget, DAG); } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case TERLOG_OP_MASK: case TERLOG_OP_MASKZ: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4)); SDValue Mask = Op.getOperand(5); MVT VT = Op.getSimpleValueType(); SDValue PassThru = Src1; // Set PassThru element. if (IntrData->Type == TERLOG_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3, Src4), Mask, PassThru, Subtarget, DAG); } case FPCLASS: { // FPclass intrinsics with mask SDValue Src1 = Op.getOperand(1); MVT VT = Src1.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm); SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, DAG.getTargetConstant(0, dl, MaskVT), Subtarget, DAG); SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, DAG.getUNDEF(BitcastVT), FPclassMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } case FPCLASSS: { SDValue Src1 = Op.getOperand(1); SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask); } case CMP_MASK: case CMP_MASK_CC: { // Comparison intrinsics with masks. // Example of transformation: // (i8 (int_x86_avx512_mask_pcmpeq_q_128 // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> // (i8 (bitcast // (v8i1 (insert_subvector undef, // (v2i1 (and (PCMPEQM %a, %b), // (extract_subvector // (v8i1 (bitcast %mask)), 0))), 0)))) MVT VT = Op.getOperand(1).getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3); MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); SDValue Cmp; if (IntrData->Type == CMP_MASK_CC) { SDValue CC = Op.getOperand(3); CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); if (cast(Rnd)->getZExtValue() != X86::STATIC_ROUNDING::CUR_DIRECTION) Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC, Rnd); } //default rounding mode if(!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC); } else { assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!"); Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2)); } SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, DAG.getTargetConstant(0, dl, MaskVT), Subtarget, DAG); SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, DAG.getUNDEF(BitcastVT), CmpMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); SDValue Mask = Op.getOperand(4); SDValue Cmp; if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); if (cast(Rnd)->getZExtValue() != X86::STATIC_ROUNDING::CUR_DIRECTION) Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); } //default rounding mode if(!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC); SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8, DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask), DAG.getValueType(MVT::i1)); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); unsigned X86CC = TranslateX86CC(CC, dl, true, LHS, RHS, DAG); assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, dl, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case COMI_RM: { // Comparison intrinsics with Sae SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDValue CC = Op.getOperand(3); SDValue Sae = Op.getOperand(4); auto ComiType = TranslateX86ConstCondToX86CC(CC); // choose between ordered and unordered (comi/ucomi) unsigned comiOp = std::get<0>(ComiType) ? IntrData->Opc0 : IntrData->Opc1; SDValue Cond; if (cast(Sae)->getZExtValue() != X86::STATIC_ROUNDING::CUR_DIRECTION) Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae); else Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG); case VSHIFT_MASK: return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG), Op.getOperand(4), Op.getOperand(3), Subtarget, DAG); case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); if (isAllOnesConstant(Mask)) // return data as is return Op.getOperand(1); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), Mask, PassThru, Subtarget, DAG); } case BROADCASTM: { SDValue Mask = Op.getOperand(1); MVT MaskVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); Mask = DAG.getBitcast(MaskVT, Mask); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); } case BLEND: { SDValue Mask = Op.getOperand(3); MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), Op.getOperand(2)); } case KUNPCK: { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); // Arguments should be swapped. SDValue Res = DAG.getNode(IntrData->Opc0, dl, MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), Src2, Src1); return DAG.getBitcast(VT, Res); } case CONVERT_TO_MASK: { MVT SrcVT = Op.getOperand(1).getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1)); SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, DAG.getUNDEF(BitcastVT), CvtMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } case CONVERT_MASK_TO_VEC: { SDValue Mask = Op.getOperand(1); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc0, dl, VT, VMask); } case BRCST_SUBVEC_TO_VEC: { SDValue Src = Op.getOperand(1); SDValue Passthru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); EVT resVT = Passthru.getValueType(); SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT, DAG.getUNDEF(resVT), Src, DAG.getIntPtrConstant(0, dl)); SDValue immVal; if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector()) immVal = DAG.getConstant(0x44, dl, MVT::i8); else immVal = DAG.getConstant(0, dl, MVT::i8); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, subVec, subVec, immVal), Mask, Passthru, Subtarget, DAG); } default: break; } } switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::x86_avx2_permd: case Intrinsic::x86_avx2_permps: // Operands intentionally swapped. Mask is last operand to intrinsic, // but second operand for node/instruction. return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(1)); // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestz_256: case Intrinsic::x86_avx_ptestc_256: case Intrinsic::x86_avx_ptestnzc_256: case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestc_ps: case Intrinsic::x86_avx_vtestnzc_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: case Intrinsic::x86_avx_vtestc_pd_256: case Intrinsic::x86_avx_vtestnzc_pd_256: { bool IsTestPacked = false; unsigned X86CC; switch (IntNo) { default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: IsTestPacked = true; // Fallthrough case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_avx_ptestz_256: // ZF = 1 X86CC = X86::COND_E; break; case Intrinsic::x86_avx_vtestc_ps: case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestc_pd_256: IsTestPacked = true; // Fallthrough case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_avx_ptestc_256: // CF = 1 X86CC = X86::COND_B; break; case Intrinsic::x86_avx_vtestnzc_ps: case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestnzc_pd_256: IsTestPacked = true; // Fallthrough case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestnzc_256: // ZF and CF = 0 X86CC = X86::COND_A; break; } SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_avx512_kortestz_w: case Intrinsic::x86_avx512_kortestc_w: { unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B; SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: case Intrinsic::x86_sse42_pcmpestric128: case Intrinsic::x86_sse42_pcmpistrio128: case Intrinsic::x86_sse42_pcmpestrio128: case Intrinsic::x86_sse42_pcmpistris128: case Intrinsic::x86_sse42_pcmpestris128: case Intrinsic::x86_sse42_pcmpistriz128: case Intrinsic::x86_sse42_pcmpestriz128: { unsigned Opcode; unsigned X86CC; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse42_pcmpistria128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpestria128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpistric128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpestric128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpistrio128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpestrio128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpistris128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpestris128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpistriz128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_E; break; case Intrinsic::x86_sse42_pcmpestriz128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_E; break; } SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, dl, MVT::i8), SDValue(PCMP.getNode(), 1)); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_sse42_pcmpistri128: case Intrinsic::x86_sse42_pcmpestri128: { unsigned Opcode; if (IntNo == Intrinsic::x86_sse42_pcmpistri128) Opcode = X86ISD::PCMPISTRI; else Opcode = X86ISD::PCMPESTRI; SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } case Intrinsic::x86_seh_lsda: { // Compute the symbol for the LSDA. We know it'll get emitted later. MachineFunction &MF = DAG.getMachineFunction(); SDValue Op1 = Op.getOperand(1); auto *Fn = cast(cast(Op1)->getGlobal()); MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( GlobalValue::getRealLinkageName(Fn->getName())); // Generate a simple absolute symbol reference. This intrinsic is only // supported on 32-bit Windows, which isn't PIC. SDValue Result = DAG.getMCSymbol(LSDASym, VT); return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); } case Intrinsic::x86_seh_recoverfp: { SDValue FnOp = Op.getOperand(1); SDValue IncomingFPOp = Op.getOperand(2); GlobalAddressSDNode *GSD = dyn_cast(FnOp); auto *Fn = dyn_cast_or_null(GSD ? GSD->getGlobal() : nullptr); if (!Fn) report_fatal_error( "llvm.x86.seh.recoverfp must take a function as the first argument"); return recoverFramePointer(DAG, Fn, IncomingFPOp); } case Intrinsic::localaddress: { // Returns one of the stack, base, or frame pointer registers, depending on // which is used to reference local variables. MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned Reg; if (RegInfo->hasBasePointer(MF)) Reg = RegInfo->getBaseRegister(); else // This function handles the SP or FP case. Reg = RegInfo->getPtrSizedFrameRegister(MF); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } } } static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget * Subtarget) { SDLoc dl(Op); auto *C = cast(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else { MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getBitcast(BitcastVT, Mask), DAG.getIntPtrConstant(0, dl)); } SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); if (Src.getOpcode() == ISD::UNDEF) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; return DAG.getMergeValues(RetOps, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); auto *C = cast(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else { MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getBitcast(BitcastVT, Mask), DAG.getIntPtrConstant(0, dl)); } SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); return SDValue(Res, 1); } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); auto *C = cast(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else MaskInReg = DAG.getBitcast(MaskVT, Mask); //SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); return SDValue(Res, 0); } // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that // read performance monitor counters (x86_rdpmc). static void getReadPerformanceCounter(SDNode *N, SDLoc DL, SelectionDAG &DAG, const X86Subtarget *Subtarget, SmallVectorImpl &Results) { assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue LO, HI; // The ECX register is used to select the index of the performance counter // to read. SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2)); SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); // Reads the content of a 64-bit performance counter and returns it in the // registers EDX:EAX. if (Subtarget->is64Bit()) { LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); } else { LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, LO.getValue(2)); } Chain = HI.getValue(1); if (Subtarget->is64Bit()) { // The EAX register is loaded with the low-order 32 bits. The EDX register // is loaded with the supported high-order bits of the counter. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. SDValue Ops[] = { LO, HI }; SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); } // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is // also used to custom lower READCYCLECOUNTER nodes. static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget *Subtarget, SmallVectorImpl &Results) { SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); SDValue LO, HI; // The processor's time-stamp counter (a 64-bit MSR) is stored into the // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR // and the EAX register is loaded with the low-order 32 bits. if (Subtarget->is64Bit()) { LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); } else { LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, LO.getValue(2)); } SDValue Chain = HI.getValue(1); if (Opcode == X86ISD::RDTSCP_DAG) { assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into // the ECX register. Add 'ecx' explicitly to the chain. SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, HI.getValue(2)); // Explicitly store the content of ECX at the location passed in input // to the 'rdtscp' intrinsic. Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), MachinePointerInfo(), false, false, 0); } if (Subtarget->is64Bit()) { // The EDX register is loaded with the high-order 32 bits of the MSR, and // the EAX register is loaded with the low-order 32 bits. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. SDValue Ops[] = { LO, HI }; SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); } static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SmallVector Results; SDLoc DL(Op); getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); return DAG.getMergeValues(Results, DL); } static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); SDValue RegNode = Op.getOperand(2); WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); if (!EHInfo) report_fatal_error("EH registrations only live in functions using WinEH"); // Cast the operand to an alloca, and remember the frame index. auto *FINode = dyn_cast(RegNode); if (!FINode) report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca"); EHInfo->EHRegNodeFrameIndex = FINode->getIndex(); // Return the chain operand without making any DAG nodes. return Chain; } /// \brief Lower intrinsics for TRUNCATE_TO_MEM case /// return truncate Store/MaskedStore Node static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op, SelectionDAG &DAG, MVT ElementType) { SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue DataToTruncate = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); MVT VT = DataToTruncate.getSimpleValueType(); MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements()); if (isAllOnesConstant(Mask)) // return just a truncate store return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MachinePointerInfo(), SVT, false, false, SVT.getScalarSizeInBits()/8); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getBitcast(BitcastVT, Mask), DAG.getIntPtrConstant(0, dl)); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore, SVT.getStoreSize(), SVT.getScalarSizeInBits()/8); return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, SVT, MMO, true); } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { if (IntNo == llvm::Intrinsic::x86_seh_ehregnode) return MarkEHRegistrationNode(Op, DAG); if (IntNo == llvm::Intrinsic::x86_flags_read_u32 || IntNo == llvm::Intrinsic::x86_flags_read_u64 || IntNo == llvm::Intrinsic::x86_flags_write_u32 || IntNo == llvm::Intrinsic::x86_flags_write_u64) { // We need a frame pointer because this will get lowered to a PUSH/POP // sequence. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setHasCopyImplyingStackAdjustment(true); // Don't do anything here, we will expand these intrinsics out later // during ExpandISelPseudos in EmitInstrWithCustomInserter. return SDValue(); } return SDValue(); } SDLoc dl(Op); switch(IntrData->Type) { default: llvm_unreachable("Unknown Intrinsic Type"); case RDSEED: case RDRAND: { // Emit the node with the right value type. SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), DAG.getConstant(1, dl, Op->getValueType(1)), DAG.getConstant(X86::COND_B, dl, MVT::i32), SDValue(Result.getNode(), 1) }; SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, DAG.getVTList(Op->getValueType(1), MVT::Glue), Ops); // Return { result, isValid, chain }. return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } case GATHER: { //gather(v1, mask, index, base, scale); SDValue Chain = Op.getOperand(0); SDValue Src = Op.getOperand(2); SDValue Base = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case SCATTER: { //scatter(base, mask, index, v1, scale); SDValue Chain = Op.getOperand(0); SDValue Base = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain); } case PREFETCH: { SDValue Hint = Op.getOperand(6); unsigned HintVal = cast(Hint)->getZExtValue(); assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1"); unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0); SDValue Chain = Op.getOperand(0); SDValue Mask = Op.getOperand(2); SDValue Index = Op.getOperand(3); SDValue Base = Op.getOperand(4); SDValue Scale = Op.getOperand(5); return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain); } // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). case RDTSC: { SmallVector Results; getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // Read Performance Monitoring Counters. case RDPMC: { SmallVector Results; getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // XTEST intrinsics. case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86::COND_NE, dl, MVT::i8), InTrans); SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Ret, SDValue(InTrans.getNode(), 1)); } // ADC/ADCX/SBB case ADX: { SmallVector Results; SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), DAG.getConstant(-1, dl, MVT::i8)); SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), Op.getOperand(4), GenCF.getValue(1)); SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), Op.getOperand(5), MachinePointerInfo(), false, false, 0); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86::COND_B, dl, MVT::i8), Res.getValue(1)); Results.push_back(SetCC); Results.push_back(Store); return DAG.getMergeValues(Results, dl); } case COMPRESS_TO_MEM: { SDValue Mask = Op.getOperand(4); SDValue DataToCompress = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); MVT VT = DataToCompress.getSimpleValueType(); if (isAllOnesConstant(Mask)) // return just a store return DAG.getStore(Chain, dl, DataToCompress, Addr, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); SDValue Compressed = getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), Mask, DAG.getUNDEF(VT), Subtarget, DAG); return DAG.getStore(Chain, dl, Compressed, Addr, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); } case TRUNCATE_TO_MEM_VI8: return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8); case TRUNCATE_TO_MEM_VI16: return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16); case TRUNCATE_TO_MEM_VI32: return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); case EXPAND_FROM_MEM: { SDValue Mask = Op.getOperand(4); SDValue PassThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); MVT VT = Op.getSimpleValueType(); if (isAllOnesConstant(Mask)) // return just a load return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, false, VT.getScalarSizeInBits()/8); SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, false, VT.getScalarSizeInBits()/8); SDValue Results[] = { getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand), Mask, PassThru, Subtarget, DAG), Chain}; return DAG.getMergeValues(Results, dl); } case LOADU: case LOADA: { SDValue Mask = Op.getOperand(4); SDValue PassThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); MVT VT = Op.getSimpleValueType(); MemIntrinsicSDNode *MemIntr = dyn_cast(Op); assert(MemIntr && "Expected MemIntrinsicSDNode!"); if (isAllOnesConstant(Mask)) // return just a load return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand()); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT, MemIntr->getMemOperand(), ISD::NON_EXTLOAD); } } } SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setReturnAddressIsTaken(true); if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); } // Just load the return address. SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, MachinePointerInfo(), false, false, false, 0); } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); EVT VT = Op.getValueType(); MFI->setFrameAddressIsTaken(true); if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { // Depth > 0 makes no sense on targets which use Windows unwind codes. It // is not possible to crawl up the stack without looking at the unwind codes // simultaneously. int FrameAddrIndex = FuncInfo->getFAIndex(); if (!FrameAddrIndex) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject( SlotSize, /*Offset=*/0, /*IsImmutable=*/false); FuncInfo->setFAIndex(FrameAddrIndex); } return DAG.getFrameIndex(FrameAddrIndex, VT); } unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, MachinePointerInfo(), false, false, false, 0); return FrameAddr; } // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); const MachineFunction &MF = DAG.getMachineFunction(); unsigned Reg = StringSwitch(RegName) .Case("esp", X86::ESP) .Case("rsp", X86::RSP) .Case("ebp", X86::EBP) .Case("rbp", X86::RBP) .Default(0); if (Reg == X86::EBP || Reg == X86::RBP) { if (!TFI.hasFP(MF)) report_fatal_error("register " + StringRef(RegName) + " is allocatable: function has no frame pointer"); #ifndef NDEBUG else { const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && "Invalid Frame Register!"); } #endif } if (Reg) return Reg; report_fatal_error("Invalid register name global variable"); } SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); } unsigned X86TargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; return Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX; } unsigned X86TargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { // Funclet personalities don't use selectors (the runtime does the selection). assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; } SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); SDLoc dl (Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && "Invalid Frame Register!"); SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, DAG.getIntPtrConstant(RegInfo->getSlotSize(), dl)); StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), false, false, 0); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, DAG.getRegister(StoreAddrReg, PtrVT)); } SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1)); } SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1)); } static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { return Op.getOperand(0); } SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { SDValue Root = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value SDLoc dl (Op); const Value *TrmpAddr = cast(Op.getOperand(4))->getValue(); const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); if (Subtarget->is64Bit()) { SDValue OutChains[6]; // Large code-model. const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix // Load the pointer to the nested function into R11. unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 SDValue Addr = Trmp; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr), false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(2, dl, MVT::i64)); OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2), false, false, 2); // Load the 'nest' parameter value into R10. // R10 is specified in X86CallingConv.td OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(10, dl, MVT::i64)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr, 10), false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(12, dl, MVT::i64)); OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12), false, false, 2); // Jump to the nested function. OpCode = (JMP64r << 8) | REX_WB; // jmpq *... Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(20, dl, MVT::i64)); OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr, 20), false, false, 0); unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(22, dl, MVT::i64)); OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), Addr, MachinePointerInfo(TrmpAddr, 22), false, false, 0); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } else { const Function *Func = cast(cast(Op.getOperand(5))->getValue()); CallingConv::ID CC = Func->getCallingConv(); unsigned NestReg; switch (CC) { default: llvm_unreachable("Unsupported calling convention"); case CallingConv::C: case CallingConv::X86_StdCall: { // Pass 'nest' parameter in ECX. // Must be kept in sync with X86CallingConv.td NestReg = X86::ECX; // Check that ECX wasn't needed by an 'inreg' parameter. FunctionType *FTy = Func->getFunctionType(); const AttributeSet &Attrs = Func->getAttributes(); if (!Attrs.isEmpty() && !Func->isVarArg()) { unsigned InRegCount = 0; unsigned Idx = 1; for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) if (Attrs.hasAttribute(Idx, Attribute::InReg)) { auto &DL = DAG.getDataLayout(); // FIXME: should only count parameters that are lowered to integers. InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; } if (InRegCount > 2) { report_fatal_error("Nest register in use - reduce number of inreg" " parameters!"); } } break; } case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::Fast: // Pass 'nest' parameter in EAX. // Must be kept in sync with X86CallingConv.td NestReg = X86::EAX; break; } SDValue OutChains[4]; SDValue Addr, Disp; Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(10, dl, MVT::i32)); Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); // This is storing the opcode for MOV32ri. const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8), Trmp, MachinePointerInfo(TrmpAddr), false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(1, dl, MVT::i32)); OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1), false, false, 1); const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(5, dl, MVT::i32)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr, MachinePointerInfo(TrmpAddr, 5), false, false, 1); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(6, dl, MVT::i32)); OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6), false, false, 1); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } } SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { /* The rounding mode is in bits 11:10 of FPSR, and has the following settings: 00 Round to nearest 01 Round to -inf 10 Round to +inf 11 Round to 0 FLT_ROUNDS, on the other hand, expects the following: -1 Undefined 0 Round to 0 1 Round to nearest 2 Round to +inf 3 Round to -inf To perform the conversion, we do: (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) */ MachineFunction &MF = DAG.getMachineFunction(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); // Save FP Control Word to stack slot int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), MachineMemOperand::MOStore, 2, 2); SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO); // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo(), false, false, false, 0); // Transform as necessary SDValue CWD1 = DAG.getNode(ISD::SRL, DL, MVT::i16, DAG.getNode(ISD::AND, DL, MVT::i16, CWD, DAG.getConstant(0x800, DL, MVT::i16)), DAG.getConstant(11, DL, MVT::i8)); SDValue CWD2 = DAG.getNode(ISD::SRL, DL, MVT::i16, DAG.getNode(ISD::AND, DL, MVT::i16, CWD, DAG.getConstant(0x400, DL, MVT::i16)), DAG.getConstant(9, DL, MVT::i8)); SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i16, DAG.getNode(ISD::ADD, DL, MVT::i16, DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), DAG.getConstant(1, DL, MVT::i16)), DAG.getConstant(3, DL, MVT::i16)); return DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } /// \brief Lower a vector CTLZ using native supported vector CTLZ instruction. // // 1. i32/i64 128/256-bit vector (native support require VLX) are expended // to 512-bit vector. // 2. i8/i16 vector implemented using dword LZCNT vector instruction // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, // split the vector, perform operation on it's Lo a Hi part and // concatenate the results. static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElems = VT.getVectorNumElements(); if (EltVT == MVT::i64 || EltVT == MVT::i32) { // Extend to 512 bit vector. assert((VT.is256BitVector() || VT.is128BitVector()) && "Unsupported value type for operation"); MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits()); SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, DAG.getUNDEF(NewVT), Op.getOperand(0), DAG.getIntPtrConstant(0, dl)); SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode, DAG.getIntPtrConstant(0, dl)); } assert((EltVT == MVT::i8 || EltVT == MVT::i16) && "Unsupported element type"); if (16 < NumElems) { // Split vector, it's Lo and Hi parts will be handled in next iteration. SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2); Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo); Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); } MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && "Unsupported value type for operation"); // Use native supported vector instruction vplzcntd. Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0)); SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op); SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode); SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT); return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta); } static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); if (VT.isVector() && Subtarget->hasAVX512()) return LowerVectorCTLZ_AVX512(Op, DAG); Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. OpVT = MVT::i32; Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } // Issue a bsr (scan bits in reverse) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); // If src is zero (i.e. bsr sets ZF), returns NumBits. SDValue Ops[] = { Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), DAG.getConstant(X86::COND_E, dl, MVT::i8), Op.getValue(1) }; Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); // Finally xor with NumBits-1. Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits - 1, dl, OpVT)); if (VT == MVT::i8) Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); return Op; } static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. OpVT = MVT::i32; Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } // Issue a bsr (scan bits in reverse). SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); // And xor with NumBits-1. Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits - 1, dl, OpVT)); if (VT == MVT::i8) Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); return Op; } static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); unsigned NumBits = VT.getScalarSizeInBits(); SDLoc dl(Op); if (VT.isVector()) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue N0 = Op.getOperand(0); SDValue Zero = DAG.getConstant(0, dl, VT); // lsb(x) = (x & -x) SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); // cttz_undef(x) = (width - 1) - ctlz(lsb) if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF && TLI.isOperationLegal(ISD::CTLZ, VT)) { SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, DAG.getNode(ISD::CTLZ, dl, VT, LSB)); } // cttz(x) = ctpop(lsb - 1) SDValue One = DAG.getConstant(1, dl, VT); return DAG.getNode(ISD::CTPOP, dl, VT, DAG.getNode(ISD::SUB, dl, VT, LSB, One)); } assert(Op.getOpcode() == ISD::CTTZ && "Only scalar CTTZ requires custom lowering"); // Issue a bsf (scan bits forward) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(VT, MVT::i32); Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0)); // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = { Op, DAG.getConstant(NumBits, dl, VT), DAG.getConstant(X86::COND_E, dl, MVT::i8), Op.getValue(1) }; return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit // ones, and then concatenate the result back. static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is256BitVector() && VT.isInteger() && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); } static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { if (Op.getValueType() == MVT::i1) return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), Op.getOperand(0), Op.getOperand(1)); assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { if (Op.getValueType() == MVT::i1) return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), Op.getOperand(0), Op.getOperand(1)); assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); if (VT == MVT::i1) return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntArith(Op, DAG); SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector // pairs, multiply and truncate. if (VT == MVT::v16i8 || VT == MVT::v32i8) { if (Subtarget->hasInt256()) { if (VT == MVT::v32i8) { MVT SubVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() / 2); SDValue Lo = DAG.getIntPtrConstant(0, dl); SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl); SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Lo); SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Lo); SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Hi); SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(ISD::MUL, dl, SubVT, ALo, BLo), DAG.getNode(ISD::MUL, dl, SubVT, AHi, BHi)); } MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); return DAG.getNode( ISD::TRUNCATE, dl, VT, DAG.getNode(ISD::MUL, dl, ExVT, DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A), DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B))); } assert(VT == MVT::v16i8 && "Pre-AVX2 support only supports v16i8 multiplication"); MVT ExVT = MVT::v8i16; // Extract the lo parts and sign extend to i16 SDValue ALo, BLo; if (Subtarget->hasSSE41()) { ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A); BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B); } else { const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7}; ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); ALo = DAG.getBitcast(ExVT, ALo); BLo = DAG.getBitcast(ExVT, BLo); ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT)); BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT)); } // Extract the hi parts and sign extend to i16 SDValue AHi, BHi; if (Subtarget->hasSSE41()) { const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi); BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi); } else { const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); AHi = DAG.getBitcast(ExVT, AHi); BHi = DAG.getBitcast(ExVT, BHi); AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT)); BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT)); } // Multiply, mask the lower 8bits of the lo/hi results and pack SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT)); RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT)); return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); } // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. if (VT == MVT::v4i32) { assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && "Should not custom lower when pmuldq is available!"); // Extract the odd parts. static const int UnpackMask[] = { 1, -1, 3, -1 }; SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); // Multiply the even parts. SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); // Now multiply odd parts. SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); Evens = DAG.getBitcast(VT, Evens); Odds = DAG.getBitcast(VT, Odds); // Merge the two vectors back together with a shuffle. This expands into 2 // shuffles. static const int ShufMask[] = { 0, 4, 2, 6 }; return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); } assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && "Only know how to lower V2I64/V4I64/V8I64 multiply"); // Ahi = psrlqi(a, 32); // Bhi = psrlqi(b, 32); // // AloBlo = pmuludq(a, b); // AloBhi = pmuludq(a, Bhi); // AhiBlo = pmuludq(Ahi, b); // AloBhi = psllqi(AloBhi, 32); // AhiBlo = psllqi(AhiBlo, 32); // return AloBlo + AloBhi + AhiBlo; SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); SDValue AhiBlo = Ahi; SDValue AloBhi = Bhi; // Bit cast to 32-bit vectors for MULUDQ MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; A = DAG.getBitcast(MulVT, A); B = DAG.getBitcast(MulVT, B); Ahi = DAG.getBitcast(MulVT, Ahi); Bhi = DAG.getBitcast(MulVT, Bhi); SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); // After shifting right const values the result may be all-zero. if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) { AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); } if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) { AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); } SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWin64() && "Unexpected target"); EVT VT = Op.getValueType(); assert(VT.isInteger() && VT.getSizeInBits() == 128 && "Unexpected return type for lowering"); RTLIB::Libcall LC; bool isSigned; switch (Op->getOpcode()) { default: llvm_unreachable("Unexpected request for libcall!"); case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; } SDLoc dl(Op); SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { EVT ArgVT = Op->getOperand(i).getValueType(); assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && "Unexpected argument type for lowering"); SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); Entry.Node = StackPtr; InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), false, false, 16); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Ty = PointerType::get(ArgTy,0); Entry.isSExt = false; Entry.isZExt = false; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(InChain) .setCallee(getLibcallCallingConv(LC), static_cast(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee, std::move(Args), 0) .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); std::pair CallInfo = LowerCallTo(CLI); return DAG.getBitcast(VT, CallInfo.first); } static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); MVT VT = Op0.getSimpleValueType(); SDLoc dl(Op); assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || (VT == MVT::v8i32 && Subtarget->hasInt256())); // PMULxD operations multiply each even value (starting at 0) of LHS with // the related value of RHS and produce a widen result. // E.g., PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> // // In other word, to have all the results, we need to perform two PMULxD: // 1. one with the even values. // 2. one with the odd values. // To achieve #2, with need to place the odd values at an even position. // // Place the odd value at an even position (basically, shift all values 1 // step to the left): const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1}; // => SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); // => SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); // Emit two multiplies, one for the lower 2 ints and one for the higher 2 // ints. MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64; bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; unsigned Opcode = (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); // Shuffle it back into the right order. SDValue Highs, Lows; if (VT == MVT::v8i32) { const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); } else { const int HighMask[] = {1, 5, 3, 7}; Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); const int LowMask[] = {0, 4, 2, 6}; Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); } // If we have a signed multiply but no PMULDQ fix up the high parts of a // unsigned multiply. if (IsSigned && !Subtarget->hasSSE41()) { SDValue ShAmt = DAG.getConstant( 31, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout())); SDValue T1 = DAG.getNode(ISD::AND, dl, VT, DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); SDValue T2 = DAG.getNode(ISD::AND, dl, VT, DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0); SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); } // The first result of MUL_LOHI is actually the low value, followed by the // high value. SDValue Ops[] = {Lows, Highs}; return DAG.getMergeValues(Ops, dl); } // Return true if the required (according to Opcode) shift-imm form is natively // supported by the Subtarget static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { if (VT.getScalarSizeInBits() < 16) return false; if (VT.is512BitVector() && (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI())) return true; bool LShift = VT.is128BitVector() || (VT.is256BitVector() && Subtarget->hasInt256()); bool AShift = LShift && (Subtarget->hasVLX() || (VT != MVT::v2i64 && VT != MVT::v4i64)); return (Opcode == ISD::SRA) ? AShift : LShift; } // The shift amount is a variable, but it is the same for all vector lanes. // These instructions are defined together with shift-immediate. static bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); } // Return true if the required (according to Opcode) variable-shift form is // natively supported by the Subtarget static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16) return false; // vXi16 supported only on AVX-512, BWI if (VT.getScalarSizeInBits() == 16 && !Subtarget->hasBWI()) return false; if (VT.is512BitVector() || Subtarget->hasVLX()) return true; bool LShift = VT.is128BitVector() || VT.is256BitVector(); bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64; return (Opcode == ISD::SRA) ? AShift : LShift; } static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) { assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type"); MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); SDValue Ex = DAG.getBitcast(ExVT, R); if (ShiftAmt >= 32) { // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG); SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, ShiftAmt - 32, DAG); if (VT == MVT::v2i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3}); if (VT == MVT::v4i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {9, 1, 11, 3, 13, 5, 15, 7}); } else { // SRA upper i32, SHL whole i64 and select lower i32. SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, ShiftAmt, DAG); SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); Lower = DAG.getBitcast(ExVT, Lower); if (VT == MVT::v2i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3}); if (VT == MVT::v4i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {8, 1, 10, 3, 12, 5, 14, 7}); } return DAG.getBitcast(VT, Ex); }; // Optimize shl/srl/sra with constant shift amount. if (auto *BVAmt = dyn_cast(Amt)) { if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { uint64_t ShiftAmt = ShiftConst->getZExtValue(); if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); // i64 SRA needs to be performed as partial shifts. if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP()) return ArithmeticShiftRight64(ShiftAmt); if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8) || VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); // Simple i8 add case if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) return DAG.getNode(ISD::ADD, dl, VT, R, R); // ashr(R, 7) === cmp_slt(R, 0) if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); } // XOP can shift v16i8 directly instead of as shift v8i16 + mask. if (VT == MVT::v16i8 && Subtarget->hasXOP()) return SDValue(); if (Op.getOpcode() == ISD::SHL) { // Make a large shift. SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R, ShiftAmt, DAG); SRL = DAG.getBitcast(VT, SRL); // Zero out the leftmost bits. return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); } if (Op.getOpcode() == ISD::SRA) { // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; } llvm_unreachable("Unknown shift opcode."); } } } // Special case in 32-bit mode, where i64 is expanded into high and low parts. if (!Subtarget->is64Bit() && !Subtarget->hasXOP() && (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) { // Peek through any splat that was introduced for i64 shift vectorization. int SplatIndex = -1; if (ShuffleVectorSDNode *SVN = dyn_cast(Amt.getNode())) if (SVN->isSplat()) { SplatIndex = SVN->getSplatIndex(); Amt = Amt.getOperand(0); assert(SplatIndex < (int)VT.getVectorNumElements() && "Splat shuffle referencing second operand"); } if (Amt.getOpcode() != ISD::BITCAST || Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR) return SDValue(); Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / VT.getVectorNumElements(); unsigned RatioInLog2 = Log2_32_Ceil(Ratio); uint64_t ShiftAmt = 0; unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio); for (unsigned i = 0; i != Ratio; ++i) { ConstantSDNode *C = dyn_cast(Amt.getOperand(i + BaseOp)); if (!C) return SDValue(); // 6 == Log2(64) ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); } // Check remaining shift amounts (if not a splat). if (SplatIndex < 0) { for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { uint64_t ShAmt = 0; for (unsigned j = 0; j != Ratio; ++j) { ConstantSDNode *C = dyn_cast(Amt.getOperand(i + j)); if (!C) return SDValue(); // 6 == Log2(64) ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); } if (ShAmt != ShiftAmt) return SDValue(); } } if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); if (Op.getOpcode() == ISD::SRA) return ArithmeticShiftRight64(ShiftAmt); } return SDValue(); } static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget* Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL : (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA; if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) { SDValue BaseShAmt; MVT EltVT = VT.getVectorElementType(); if (BuildVectorSDNode *BV = dyn_cast(Amt)) { // Check if this build_vector node is doing a splat. // If so, then set BaseShAmt equal to the splat value. BaseShAmt = BV->getSplatValue(); if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF) BaseShAmt = SDValue(); } else { if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) Amt = Amt.getOperand(0); ShuffleVectorSDNode *SVN = dyn_cast(Amt); if (SVN && SVN->isSplat()) { unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); SDValue InVec = Amt.getOperand(0); if (InVec.getOpcode() == ISD::BUILD_VECTOR) { assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) && "Unexpected shuffle index found!"); BaseShAmt = InVec.getOperand(SplatIdx); } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { if (ConstantSDNode *C = dyn_cast(InVec.getOperand(2))) { if (C->getZExtValue() == SplatIdx) BaseShAmt = InVec.getOperand(1); } } if (!BaseShAmt) // Avoid introducing an extract element from a shuffle. BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec, DAG.getIntPtrConstant(SplatIdx, dl)); } } if (BaseShAmt.getNode()) { assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); else if (EltVT.bitsLT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG); } } // Special case in 32-bit mode, where i64 is expanded into high and low parts. if (!Subtarget->is64Bit() && VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST && Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / VT.getVectorNumElements(); std::vector Vals(Ratio); for (unsigned i = 0; i != Ratio; ++i) Vals[i] = Amt.getOperand(i); for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { for (unsigned j = 0; j != Ratio; ++j) if (Vals[j] != Amt.getOperand(i + j)) return SDValue(); } if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); } return SDValue(); } static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); assert(VT.isVector() && "Custom lowering only for vector shifts!"); assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!"); if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) return V; if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) return V; if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode())) return Op; // XOP has 128-bit variable logical/arithmetic shifts. // +ve/-ve Amt = shift left/right. if (Subtarget->hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) { if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) { SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt); } if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt); if (Op.getOpcode() == ISD::SRA) return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt); } // 2i64 vector logical shifts can efficiently avoid scalarization - do the // shifts per-lane and then shuffle the partial results back together. if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { // Splat the shift amounts so the scalar shifts above will catch it. SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0}); SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1}); SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0); SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1); return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); } // i64 vector arithmetic shift can be emulated with the transform: // M = lshr(SIGN_BIT, Amt) // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) && Op.getOpcode() == ISD::SRA) { SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT); SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); R = DAG.getNode(ISD::XOR, dl, VT, R, M); R = DAG.getNode(ISD::SUB, dl, VT, R, M); return R; } // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. if (Op.getOpcode() == ISD::SHL && (VT == MVT::v8i16 || VT == MVT::v4i32 || (Subtarget->hasInt256() && VT == MVT::v16i16)) && ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { SmallVector Elts; MVT SVT = VT.getVectorElementType(); unsigned SVTBits = SVT.getSizeInBits(); APInt One(SVTBits, 1); unsigned NumElems = VT.getVectorNumElements(); for (unsigned i=0; i !=NumElems; ++i) { SDValue Op = Amt->getOperand(i); if (Op->getOpcode() == ISD::UNDEF) { Elts.push_back(Op); continue; } ConstantSDNode *ND = cast(Op); APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); uint64_t ShAmt = C.getZExtValue(); if (ShAmt >= SVTBits) { Elts.push_back(DAG.getUNDEF(SVT)); continue; } Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT)); } SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); return DAG.getNode(ISD::MUL, dl, VT, R, BV); } // Lower SHL with variable shift amount. if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT)); Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, dl, VT)); Op = DAG.getBitcast(MVT::v4f32, Op); Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); return DAG.getNode(ISD::MUL, dl, VT, Op, R); } // If possible, lower this shift as a sequence of two shifts by // constant plus a MOVSS/MOVSD instead of scalarizing it. // Example: // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) // // Could be rewritten as: // (v4i32 (MOVSS (srl A, ), (srl A, ))) // // The advantage is that the two shifts from the example would be // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing // the vector shift into four scalar shifts plus four pairs of vector // insert/extract. if ((VT == MVT::v8i16 || VT == MVT::v4i32) && ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { unsigned TargetOpcode = X86ISD::MOVSS; bool CanBeSimplified; // The splat value for the first packed shift (the 'X' from the example). SDValue Amt1 = Amt->getOperand(0); // The splat value for the second packed shift (the 'Y' from the example). SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2); // See if it is possible to replace this node with a sequence of // two shifts followed by a MOVSS/MOVSD if (VT == MVT::v4i32) { // Check if it is legal to use a MOVSS. CanBeSimplified = Amt2 == Amt->getOperand(2) && Amt2 == Amt->getOperand(3); if (!CanBeSimplified) { // Otherwise, check if we can still simplify this node using a MOVSD. CanBeSimplified = Amt1 == Amt->getOperand(1) && Amt->getOperand(2) == Amt->getOperand(3); TargetOpcode = X86ISD::MOVSD; Amt2 = Amt->getOperand(2); } } else { // Do similar checks for the case where the machine value type // is MVT::v8i16. CanBeSimplified = Amt1 == Amt->getOperand(1); for (unsigned i=3; i != 8 && CanBeSimplified; ++i) CanBeSimplified = Amt2 == Amt->getOperand(i); if (!CanBeSimplified) { TargetOpcode = X86ISD::MOVSD; CanBeSimplified = true; Amt2 = Amt->getOperand(4); for (unsigned i=0; i != 4 && CanBeSimplified; ++i) CanBeSimplified = Amt1 == Amt->getOperand(i); for (unsigned j=4; j != 8 && CanBeSimplified; ++j) CanBeSimplified = Amt2 == Amt->getOperand(j); } } if (CanBeSimplified && isa(Amt1) && isa(Amt2)) { // Replace this node with two shifts followed by a MOVSS/MOVSD. MVT CastVT = MVT::v4i32; SDValue Splat1 = DAG.getConstant(cast(Amt1)->getAPIntValue(), dl, VT); SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); SDValue Splat2 = DAG.getConstant(cast(Amt2)->getAPIntValue(), dl, VT); SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); if (TargetOpcode == X86ISD::MOVSD) CastVT = MVT::v2i64; SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1); SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2); SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2, BitCast1, DAG); return DAG.getBitcast(VT, Result); } } // v4i32 Non Uniform Shifts. // If the shift amount is constant we can shift each lane using the SSE2 // immediate shifts, else we need to zero-extend each lane to the lower i64 // and shift using the SSE2 variable shifts. // The separate results can then be blended together. if (VT == MVT::v4i32) { unsigned Opc = Op.getOpcode(); SDValue Amt0, Amt1, Amt2, Amt3; if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0}); Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1}); Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2}); Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3}); } else { // ISD::SHL is handled above but we include it here for completeness. switch (Opc) { default: llvm_unreachable("Unknown target vector shift node"); case ISD::SHL: Opc = X86ISD::VSHL; break; case ISD::SRL: Opc = X86ISD::VSRL; break; case ISD::SRA: Opc = X86ISD::VSRA; break; } // The SSE2 shifts use the lower i64 as the same shift amount for // all lanes and the upper i64 is ignored. These shuffle masks // optimally zero-extend each lanes on SSE2/SSE41/AVX targets. SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1}); Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1}); Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1}); Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1}); } SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0); SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1); SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2); SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3); SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1}); SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7}); return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); } if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); unsigned ShiftOpcode = Op->getOpcode(); auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. if (Subtarget->hasSSE41()) { V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); return DAG.getBitcast(SelVT, DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl); SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 5; // We can safely do this using i16 shifts as we're only interested in // the 3 lower bits of each byte. Amt = DAG.getBitcast(ExtVT, Amt); Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT)); Amt = DAG.getBitcast(VT, Amt); if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) { // r = VSELECT(r, shift(r, 4), a); SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); R = SignBitSelect(VT, Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // r = VSELECT(r, shift(r, 2), a); M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); R = SignBitSelect(VT, Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // return VSELECT(r, shift(r, 1), a); M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); R = SignBitSelect(VT, Amt, M, R); return R; } if (Op->getOpcode() == ISD::SRA) { // For SRA we need to unpack each byte to the higher byte of a i16 vector // so we can correctly sign extend. We don't care what happens to the // lower byte. SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt); SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt); SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R); SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R); ALo = DAG.getBitcast(ExtVT, ALo); AHi = DAG.getBitcast(ExtVT, AHi); RLo = DAG.getBitcast(ExtVT, RLo); RHi = DAG.getBitcast(ExtVT, RHi); // r = VSELECT(r, shift(r, 4), a); SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, DAG.getConstant(4, dl, ExtVT)); SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, DAG.getConstant(4, dl, ExtVT)); RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); // a += a ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); // r = VSELECT(r, shift(r, 2), a); MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, DAG.getConstant(2, dl, ExtVT)); MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, DAG.getConstant(2, dl, ExtVT)); RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); // a += a ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); // r = VSELECT(r, shift(r, 1), a); MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, DAG.getConstant(1, dl, ExtVT)); MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, DAG.getConstant(1, dl, ExtVT)); RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); // Logical shift the result back to the lower byte, leaving a zero upper // byte // meaning that we can safely pack with PACKUSWB. RLo = DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT)); RHi = DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT)); return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); } } // It's worth extending once and using the v8i32 shifts for 16-bit types, but // the extra overheads to get from v16i8 to v8i32 make the existing SSE // solution better. if (Subtarget->hasInt256() && VT == MVT::v8i16) { MVT ExtVT = MVT::v8i32; unsigned ExtOpc = Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; R = DAG.getNode(ExtOpc, dl, ExtVT, R); Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); } if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) { MVT ExtVT = MVT::v8i32; SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z); SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R); SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R); ALo = DAG.getBitcast(ExtVT, ALo); AHi = DAG.getBitcast(ExtVT, AHi); RLo = DAG.getBitcast(ExtVT, RLo); RHi = DAG.getBitcast(ExtVT, RHi); SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo); SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi); Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT)); Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT)); return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); } if (VT == MVT::v8i16) { unsigned ShiftOpcode = Op->getOpcode(); auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. if (Subtarget->hasSSE41()) { MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); V0 = DAG.getBitcast(ExtVT, V0); V1 = DAG.getBitcast(ExtVT, V1); Sel = DAG.getBitcast(ExtVT, Sel); return DAG.getBitcast( VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1)); } // On pre-SSE41 targets we splat the sign bit - a negative value will // set all bits of the lanes to true and VSELECT uses that in // its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue C = DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT)); return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 12; if (Subtarget->hasSSE41()) { // On SSE41 targets we need to replicate the shift mask in both // bytes for PBLENDVB. Amt = DAG.getNode( ISD::OR, dl, VT, DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)), DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT))); } else { Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)); } // r = VSELECT(r, shift(r, 8), a); SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT)); R = SignBitSelect(Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // r = VSELECT(r, shift(r, 4), a); M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); R = SignBitSelect(Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // r = VSELECT(r, shift(r, 2), a); M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); R = SignBitSelect(Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // return VSELECT(r, shift(r, 1), a); M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); R = SignBitSelect(Amt, M, R); return R; } // Decompose 256-bit shifts into smaller 128-bit shifts. if (VT.is256BitVector()) { unsigned NumElems = VT.getVectorNumElements(); MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); // Extract the two vectors SDValue V1 = Extract128BitVector(R, 0, DAG, dl); SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); // Recreate the shift amount vectors SDValue Amt1, Amt2; if (Amt.getOpcode() == ISD::BUILD_VECTOR) { // Constant shift amount SmallVector Ops(Amt->op_begin(), Amt->op_begin() + NumElems); ArrayRef Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2); ArrayRef Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2); Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); } else { // Variable shift amount Amt1 = Extract128BitVector(Amt, 0, DAG, dl); Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); } // Issue new vector shifts for the smaller types V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); // Concatenate the result back return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); } return SDValue(); } static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); assert(VT.isVector() && "Custom lowering only for vector rotates!"); assert(Subtarget->hasXOP() && "XOP support required for vector rotates!"); assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); // XOP has 128-bit vector variable + immediate rotates. // +ve/-ve Amt = rotate left/right. // Split 256-bit integers. if (VT.is256BitVector()) return Lower256IntArith(Op, DAG); assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); // Attempt to rotate by immediate. if (auto *BVAmt = dyn_cast(Amt)) { if (auto *RotateConst = BVAmt->getConstantSplatNode()) { uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range"); return DAG.getNode(X86ISD::VPROTI, DL, VT, R, DAG.getConstant(RotateAmt, DL, MVT::i8)); } } // Use general rotate by variable (per-element). return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. The "brcond" lowering // looks for this combo and may remove the "setcc" instruction if the "setcc" // has only one use. SDNode *N = Op.getNode(); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); unsigned BaseOp = 0; unsigned Cond = 0; SDLoc DL(Op); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown ovf instruction!"); case ISD::SADDO: // A subtract of one will be selected as a INC. Note that INC doesn't // set CF, so we can't do this for UADDO. if (isOneConstant(RHS)) { BaseOp = X86ISD::INC; Cond = X86::COND_O; break; } BaseOp = X86ISD::ADD; Cond = X86::COND_O; break; case ISD::UADDO: BaseOp = X86ISD::ADD; Cond = X86::COND_B; break; case ISD::SSUBO: // A subtract of one will be selected as a DEC. Note that DEC doesn't // set CF, so we can't do this for USUBO. if (isOneConstant(RHS)) { BaseOp = X86ISD::DEC; Cond = X86::COND_O; break; } BaseOp = X86ISD::SUB; Cond = X86::COND_O; break; case ISD::USUBO: BaseOp = X86ISD::SUB; Cond = X86::COND_B; break; case ISD::SMULO: BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL; Cond = X86::COND_O; break; case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs if (N->getValueType(0) == MVT::i8) { BaseOp = X86ISD::UMUL8; Cond = X86::COND_O; break; } SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), MVT::i32); SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(X86::COND_O, DL, MVT::i32), SDValue(Sum.getNode(), 2)); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } } // Also sets EFLAGS. SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), DAG.getConstant(Cond, DL, MVT::i32), SDValue(Sum.getNode(), 1)); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } /// Returns true if the operand type is exactly twice the native width, and /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. /// Used to know whether to use cmpxchg8/16b when expanding atomic operations /// (otherwise we leave them alone to become __sync_fetch_and_... calls). bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b else if (OpWidth == 128) return Subtarget->hasCmpxchg16b(); else return false; } bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { return needsCmpXchgNb(SI->getValueOperand()->getType()); } // Note: this turns large loads into lock cmpxchg8b/16b. // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { auto PTy = cast(LI->getPointerOperand()->getType()); return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; } TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available // and default to library calls otherwise. if (MemType->getPrimitiveSizeInBits() > NativeWidth) { return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; } AtomicRMWInst::BinOp Op = AI->getOperation(); switch (Op) { default: llvm_unreachable("Unknown atomic operation"); case AtomicRMWInst::Xchg: case AtomicRMWInst::Add: case AtomicRMWInst::Sub: // It's better to use xadd, xsub or xchg for these in all cases. return AtomicExpansionKind::None; case AtomicRMWInst::Or: case AtomicRMWInst::And: case AtomicRMWInst::Xor: // If the atomicrmw's result isn't actually used, we can just add a "lock" // prefix to a normal instruction for these operations. return !AI->use_empty() ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; case AtomicRMWInst::Nand: case AtomicRMWInst::Max: case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. return AtomicExpansionKind::CmpXChg; } } static bool hasMFENCE(const X86Subtarget& Subtarget) { // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for // no-sse2). There isn't any reason to disable it if the target processor // supports it. return Subtarget.hasSSE2() || Subtarget.is64Bit(); } LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; Type *MemType = AI->getType(); // Accesses larger than the native width are turned into cmpxchg/libcalls, so // there is no benefit in turning such RMWs into loads, and it is actually // harmful as it introduces a mfence. if (MemType->getPrimitiveSizeInBits() > NativeWidth) return nullptr; auto Builder = IRBuilder<>(AI); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); auto SynchScope = AI->getSynchScope(); // We must restrict the ordering to avoid generating loads with Release or // ReleaseAcquire orderings. auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); auto Ptr = AI->getPointerOperand(); // Before the load we need a fence. Here is an example lifted from // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence // is required: // Thread 0: // x.store(1, relaxed); // r1 = y.fetch_add(0, release); // Thread 1: // y.fetch_add(42, acquire); // r2 = x.load(relaxed); // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is // lowered to just a load without a fence. A mfence flushes the store buffer, // making the optimization clearly correct. // FIXME: it is required if isAtLeastRelease(Order) but it is not clear // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. if (SynchScope == SingleThread) // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at // the IR level, so we must wrap it in an intrinsic. return nullptr; if (!hasMFENCE(*Subtarget)) // FIXME: it might make sense to use a locked operation here but on a // different cache-line to prevent cache-line bouncing. In practice it // is probably a small win, and x86 processors without mfence are rare // enough that we do not bother. return nullptr; Function *MFence = llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); Builder.CreateCall(MFence, {}); // Finally we can emit the atomic load. LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, AI->getType()->getPrimitiveSizeInBits()); Loaded->setAtomic(Order, SynchScope); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); return Loaded; } static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); AtomicOrdering FenceOrdering = static_cast( cast(Op.getOperand(1))->getZExtValue()); SynchronizationScope FenceScope = static_cast( cast(Op.getOperand(2))->getZExtValue()); // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { if (hasMFENCE(*Subtarget)) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); SDValue Zero = DAG.getConstant(0, dl, MVT::i32); SDValue Ops[] = { DAG.getRegister(X86::ESP, MVT::i32), // Base DAG.getTargetConstant(1, dl, MVT::i8), // Scale DAG.getRegister(0, MVT::i32), // Index DAG.getTargetConstant(0, dl, MVT::i32), // Disp DAG.getRegister(0, MVT::i32), // Segment. Zero, Chain }; SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops); return SDValue(Res, 0); } // MEMBARRIER is a compiler barrier; it codegens to a no-op. return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); } static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT T = Op.getSimpleValueType(); SDLoc DL(Op); unsigned Reg = 0; unsigned size = 0; switch(T.SimpleTy) { default: llvm_unreachable("Invalid value type!"); case MVT::i8: Reg = X86::AL; size = 1; break; case MVT::i16: Reg = X86::AX; size = 2; break; case MVT::i32: Reg = X86::EAX; size = 4; break; case MVT::i64: assert(Subtarget->is64Bit() && "Node not type legal!"); Reg = X86::RAX; size = 8; break; } SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, Op.getOperand(2), SDValue()); SDValue Ops[] = { cpIn.getValue(0), Op.getOperand(1), Op.getOperand(3), DAG.getTargetConstant(size, DL, MVT::i8), cpIn.getValue(1) }; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO); SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, MVT::i32, cpOut.getValue(2)); SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1), DAG.getConstant(X86::COND_E, DL, MVT::i8), EFLAGS); DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); return SDValue(); } static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || SrcVT == MVT::i64) { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); if (DstVT != MVT::f64) // This conversion needs to be expanded. return SDValue(); SDValue Op0 = Op->getOperand(0); SmallVector Elts; SDLoc dl(Op); unsigned NumElts; MVT SVT; if (SrcVT.isVector()) { NumElts = SrcVT.getVectorNumElements(); SVT = SrcVT.getVectorElementType(); // Widen the vector in input in the case of MVT::v2i32. // Example: from MVT::v2i32 to MVT::v4i32. for (unsigned i = 0, e = NumElts; i != e; ++i) Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0, DAG.getIntPtrConstant(i, dl))); } else { assert(SrcVT == MVT::i64 && !Subtarget->is64Bit() && "Unexpected source type in LowerBITCAST"); Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, DAG.getIntPtrConstant(0, dl))); Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, DAG.getIntPtrConstant(1, dl))); NumElts = 2; SVT = MVT::i32; } // Explicitly mark the extra elements as Undef. Elts.append(NumElts, DAG.getUNDEF(SVT)); EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts); SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, DAG.getIntPtrConstant(0, dl)); } assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && Subtarget->hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || (DstVT.isVector() && DstVT.getSizeInBits()==64)) && "Unexpected custom BITCAST"); // i64 <=> MMX conversions are Legal. if (SrcVT==MVT::i64 && DstVT.isVector()) return Op; if (DstVT==MVT::i64 && SrcVT.isVector()) return Op; // MMX <=> MMX conversions are Legal. if (SrcVT.isVector() && DstVT.isVector()) return Op; // All other conversions need to be expanded. return SDValue(); } /// Compute the horizontal sum of bytes in V for the elements of VT. /// /// Requires V to be a byte vector and VT to be an integer vector type with /// wider elements than V's type. The width of the elements of VT determines /// how many bytes of V are summed horizontally to produce each element of the /// result. static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(V); MVT ByteVecVT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); int NumElts = VT.getVectorNumElements(); assert(ByteVecVT.getVectorElementType() == MVT::i8 && "Expected value to have byte element type."); assert(EltVT != MVT::i8 && "Horizontal byte sum only makes sense for wider elements!"); unsigned VecSize = VT.getSizeInBits(); assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!"); // PSADBW instruction horizontally add all bytes and leave the result in i64 // chunks, thus directly computes the pop count for v2i64 and v4i64. if (EltVT == MVT::i64) { SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); return DAG.getBitcast(VT, V); } if (EltVT == MVT::i32) { // We unpack the low half and high half into i32s interleaved with zeros so // that we can use PSADBW to horizontally sum them. The most useful part of // this is that it lines up the results of two PSADBW instructions to be // two v2i64 vectors which concatenated are the 4 population counts. We can // then use PACKUSWB to shrink and concatenate them into a v4i32 again. SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL); SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros); SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros); // Do the horizontal sums into two v2i64s. Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, Low), Zeros); High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, High), Zeros); // Merge them together. MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16); V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT, DAG.getBitcast(ShortVecVT, Low), DAG.getBitcast(ShortVecVT, High)); return DAG.getBitcast(VT, V); } // The only element type left is i16. assert(EltVT == MVT::i16 && "Unknown how to handle type"); // To obtain pop count for each i16 element starting from the pop count for // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s // right by 8. It is important to shift as i16s as i8 vector shift isn't // directly supported. SmallVector Shifters(NumElts, DAG.getConstant(8, DL, EltVT)); SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters); SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter); V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl), DAG.getBitcast(ByteVecVT, V)); return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter); } static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned VecSize = VT.getSizeInBits(); // Implement a lookup table in register by using an algorithm based on: // http://wm.ite.pl/articles/sse-popcount.html // // The general idea is that every lower byte nibble in the input vector is an // index into a in-register pre-computed pop count table. We then split up the // input vector in two new ones: (1) a vector with only the shifted-right // higher nibbles for each byte and (2) a vector with the lower nibbles (and // masked out higher ones) for each byte. PSHUB is used separately with both // to index the in-register table. Next, both are added and the result is a // i8 vector where each element contains the pop count for input byte. // // To obtain the pop count for elements != i8, we follow up with the same // approach and use additional tricks as described below. // const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4}; int NumByteElts = VecSize / 8; MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts); SDValue In = DAG.getBitcast(ByteVecVT, Op); SmallVector LUTVec; for (int i = 0; i < NumByteElts; ++i) LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec); SmallVector Mask0F(NumByteElts, DAG.getConstant(0x0F, DL, MVT::i8)); SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F); // High nibbles SmallVector Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8)); SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four); SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV); // Low nibbles SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F); // The input vector is used as the shuffle mask that index elements into the // LUT. After counting low and high nibbles, add the vector to obtain the // final pop count per i8 element. SDValue HighPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles); SDValue LowPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles); SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt); if (EltVT == MVT::i8) return PopCnt; return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG); } static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is128BitVector() && "Only 128-bit vector bitmath lowering supported."); int VecSize = VT.getSizeInBits(); MVT EltVT = VT.getVectorElementType(); int Len = EltVT.getSizeInBits(); // This is the vectorized version of the "best" algorithm from // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel // with a minor tweak to use a series of adds + shifts instead of vector // multiplications. Implemented for all integer vector types. We only use // this when we don't have SSSE3 which allows a LUT-based lowering that is // much faster, even faster than using native popcnt instructions. auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) { MVT VT = V.getSimpleValueType(); SmallVector Shifters( VT.getVectorNumElements(), DAG.getConstant(Shifter, DL, VT.getVectorElementType())); return DAG.getNode(OpCode, DL, VT, V, DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters)); }; auto GetMask = [&](SDValue V, APInt Mask) { MVT VT = V.getSimpleValueType(); SmallVector Masks( VT.getVectorNumElements(), DAG.getConstant(Mask, DL, VT.getVectorElementType())); return DAG.getNode(ISD::AND, DL, VT, V, DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Masks)); }; // We don't want to incur the implicit masks required to SRL vNi8 vectors on // x86, so set the SRL type to have elements at least i16 wide. This is // correct because all of our SRLs are followed immediately by a mask anyways // that handles any bits that sneak into the high bits of the byte elements. MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16); SDValue V = Op; // v = v - ((v >> 1) & 0x55555555...) SDValue Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1)); SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55))); V = DAG.getNode(ISD::SUB, DL, VT, V, And); // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33))); Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2)); SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33))); V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS); // v = (v + (v >> 4)) & 0x0F0F0F0F... Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4)); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl); V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F))); // At this point, V contains the byte-wise population count, and we are // merely doing a horizontal sum if necessary to get the wider element // counts. if (EltVT == MVT::i8) return V; return LowerHorizontalByteSum( DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget, DAG); } static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // FIXME: Need to add AVX-512 support here! assert((VT.is256BitVector() || VT.is128BitVector()) && "Unknown CTPOP type to handle"); SDLoc DL(Op.getNode()); SDValue Op0 = Op.getOperand(0); if (!Subtarget->hasSSSE3()) { // We can't use the fast LUT approach, so fall back on vectorized bitmath. assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"); return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); } if (VT.is256BitVector() && !Subtarget->hasInt256()) { unsigned NumElems = VT.getVectorNumElements(); // Extract each 128-bit vector, compute pop count and concat the result. SDValue LHS = Extract128BitVector(Op0, 0, DAG, DL); SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG), LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG)); } return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); } static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Op.getSimpleValueType().isVector() && "We only do custom lowering for vector population count."); return LowerVectorCTPOP(Op, Subtarget, DAG); } static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); SDLoc dl(Node); EVT T = Node->getValueType(0); SDValue negOp = DAG.getNode(ISD::SUB, dl, T, DAG.getConstant(0, dl, T), Node->getOperand(2)); return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, cast(Node)->getMemoryVT(), Node->getOperand(0), Node->getOperand(1), negOp, cast(Node)->getMemOperand(), cast(Node)->getOrdering(), cast(Node)->getSynchScope()); } static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); SDLoc dl(Node); EVT VT = cast(Node)->getMemoryVT(); // Convert seq_cst store -> xchg // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) // FIXME: On 32-bit, store -> fist or movq would be more efficient // (The only way to get a 16-byte store is cmpxchg16b) // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. if (cast(Node)->getOrdering() == SequentiallyConsistent || !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, cast(Node)->getMemoryVT(), Node->getOperand(0), Node->getOperand(1), Node->getOperand(2), cast(Node)->getMemOperand(), cast(Node)->getOrdering(), cast(Node)->getSynchScope()); return Swap.getValue(1); } // Other atomic stores have a simple pattern. return Op; } static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getNode()->getSimpleValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); SDVTList VTs = DAG.getVTList(VT, MVT::i32); unsigned Opc; bool ExtraOp = false; switch (Op.getOpcode()) { default: llvm_unreachable("Invalid code"); case ISD::ADDC: Opc = X86ISD::ADD; break; case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; case ISD::SUBC: Opc = X86ISD::SUB; break; case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; } if (!ExtraOp) return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); } static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); // For MacOSX, we want to call an alternative entry point: __sincos_stret, // which returns the values as { float, float } (in XMM0) or // { double, double } (which is returned in XMM0, XMM1). SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.isSExt = false; Entry.isZExt = false; Args.push_back(Entry); bool isF64 = ArgVT == MVT::f64; // Only optimize x86_64 for now. i386 is a bit messy. For f32, // the small struct {f32, f32} is returned in (eax, edx). For f64, // the results are returned via SRet in memory. const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); Type *RetTy = isF64 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr) : (Type*)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0); std::pair CallResult = TLI.LowerCallTo(CLI); if (isF64) // Returned in xmm0 and xmm1. return CallResult.first; // Returned in bits 0:31 and 32:64 xmm0. SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, DAG.getIntPtrConstant(0, dl)); SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, DAG.getIntPtrConstant(1, dl)); SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes = false) { // Check if InOp already has the right width. MVT InVT = InOp.getSimpleValueType(); if (InVT == NVT) return InOp; if (InOp.isUndef()) return DAG.getUNDEF(NVT); assert(InVT.getVectorElementType() == NVT.getVectorElementType() && "input and widen element type must match"); unsigned InNumElts = InVT.getVectorNumElements(); unsigned WidenNumElts = NVT.getVectorNumElements(); assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && "Unexpected request for vector widening"); EVT EltVT = NVT.getVectorElementType(); SDLoc dl(InOp); if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) { SDValue N1 = InOp.getOperand(1); if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || N1.isUndef()) { InOp = InOp.getOperand(0); InVT = InOp.getSimpleValueType(); InNumElts = InVT.getVectorNumElements(); } } if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { SmallVector Ops; for (unsigned i = 0; i < InNumElts; ++i) Ops.push_back(InOp.getOperand(i)); SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT); for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) Ops.push_back(FillVal); return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); } SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); // X86 scatter kills mask register, so its type should be added to // the list of return values. // If the "scatter" has 2 return values, it is already handled. if (Op.getNode()->getNumValues() == 2) return Op; MaskedScatterSDNode *N = cast(Op.getNode()); SDValue Src = N->getValue(); MVT VT = Src.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); SDLoc dl(Op); SDValue NewScatter; SDValue Index = N->getIndex(); SDValue Mask = N->getMask(); SDValue Chain = N->getChain(); SDValue BasePtr = N->getBasePtr(); MVT MemVT = N->getMemoryVT().getSimpleVT(); MVT IndexVT = Index.getSimpleValueType(); MVT MaskVT = Mask.getSimpleValueType(); if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) { // The v2i32 value was promoted to v2i64. // Now we "redo" the type legalizer's work and widen the original // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64 // with a shuffle. assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) && "Unexpected memory type"); int ShuffleMask[] = {0, 2, -1, -1}; Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src), DAG.getUNDEF(MVT::v4i32), ShuffleMask); // Now we have 4 elements instead of 2. // Expand the index. MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4); Index = ExtendToType(Index, NewIndexVT, DAG); // Expand the mask with zeroes // Mask may be <2 x i64> or <2 x i1> at this moment assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) && "Unexpected mask type"); MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4); Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); VT = MVT::v4i32; } unsigned NumElts = VT.getVectorNumElements(); if (!Subtarget->hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { // AVX512F supports only 512-bit vectors. Or data or index should // be 512 bit wide. If now the both index and data are 256-bit, but // the vector contains 8 elements, we just sign-extend the index if (IndexVT == MVT::v8i32) // Just extend index Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); else { // The minimal number of elts in scatter is 8 NumElts = 8; // Index MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); // Use original index here, do not modify the index twice Index = ExtendToType(N->getIndex(), NewIndexVT, DAG); if (IndexVT.getScalarType() == MVT::i32) Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); // Mask // At this point we have promoted mask operand assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); // Use the original mask here, do not modify the mask twice Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true); // The value that should be stored MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); Src = ExtendToType(Src, NewVT, DAG); } } // If the mask is "wide" at this point - truncate it to i1 vector MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts); Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask); // The mask is killed by scatter, add it to the values SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index}; NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops, N->getMemOperand()); DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 0); } static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MaskedLoadSDNode *N = cast(Op.getNode()); MVT VT = Op.getSimpleValueType(); SDValue Mask = N->getMask(); SDLoc dl(Op); if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { // This operation is legal for targets with VLX, but without // VLX the vector should be widened to 512 bit unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); SDValue Src0 = N->getSrc0(); Src0 = ExtendToType(Src0, WideDataVT, DAG); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), N->getBasePtr(), Mask, Src0, N->getMemoryVT(), N->getMemOperand(), N->getExtensionType()); SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0), DAG.getIntPtrConstant(0, dl)); SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } return Op; } static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MaskedStoreSDNode *N = cast(Op.getNode()); SDValue DataToStore = N->getValue(); MVT VT = DataToStore.getSimpleValueType(); SDValue Mask = N->getMask(); SDLoc dl(Op); if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { // This operation is legal for targets with VLX, but without // VLX the vector should be widened to 512 bit unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), Mask, N->getMemoryVT(), N->getMemOperand(), N->isTruncatingStore()); } return Op; } static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); MaskedGatherSDNode *N = cast(Op.getNode()); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); SDValue Index = N->getIndex(); SDValue Mask = N->getMask(); SDValue Src0 = N->getValue(); MVT IndexVT = Index.getSimpleValueType(); MVT MaskVT = Mask.getSimpleValueType(); unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); if (!Subtarget->hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { // AVX512F supports only 512-bit vectors. Or data or index should // be 512 bit wide. If now the both index and data are 256-bit, but // the vector contains 8 elements, we just sign-extend the index if (NumElts == 8) { Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), Index }; DAG.UpdateNodeOperands(N, Ops); return Op; } // Minimal number of elements in Gather NumElts = 8; // Index MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); Index = ExtendToType(Index, NewIndexVT, DAG); if (IndexVT.getScalarType() == MVT::i32) Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); // Mask MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts); // At this point we have promoted mask operand assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask); // The pass-thru value MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); Src0 = ExtendToType(Src0, NewVT, DAG); SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other), N->getMemoryVT(), dl, Ops, N->getMemOperand()); SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewGather.getValue(0), DAG.getIntPtrConstant(0, dl)); SDValue RetOps[] = {Exract, NewGather.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } return Op; } SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const { // TODO: Eventually, the lowering of these nodes should be informed by or // deferred to the GC strategy for the function in which they appear. For // now, however, they must be lowered to something. Since they are logically // no-ops in the case of a null GC strategy (or a GC strategy which does not // require special handling for these nodes), lower them as literal NOOPs for // the time being. SmallVector Ops; Ops.push_back(Op.getOperand(0)); if (Op->getGluedNode()) Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); SDLoc OpDL(Op); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); return NOOP; } SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const { // TODO: Eventually, the lowering of these nodes should be informed by or // deferred to the GC strategy for the function in which they appear. For // now, however, they must be lowered to something. Since they are logically // no-ops in the case of a null GC strategy (or a GC strategy which does not // require special handling for these nodes), lower them as literal NOOPs for // the time being. SmallVector Ops; Ops.push_back(Op.getOperand(0)); if (Op->getGluedNode()) Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); SDLoc OpDL(Op); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); return NOOP; } /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::SHL_PARTS: case ISD::SRA_PARTS: case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); case ISD::SIGN_EXTEND_VECTOR_INREG: return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); case ISD::FABS: case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::SETCCE: return LowerSETCCE(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::FRAME_TO_ARGS_OFFSET: return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::CTLZ: return LowerCTLZ(Op, Subtarget, DAG); case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); case ISD::ADDC: case ISD::ADDE: case ISD::SUBC: case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: case ISD::UMIN: return LowerMINMAX(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: return LowerGC_TRANSITION_START(Op, DAG); case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG); } } /// ReplaceNodeResults - Replace a node with an illegal result type /// with a new node built out of custom code. void X86TargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, SelectionDAG &DAG) const { SDLoc dl(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); case X86ISD::AVG: { // Legalize types for X86ISD::AVG by expanding vectors. assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); auto InVT = N->getValueType(0); auto InVTSize = InVT.getSizeInBits(); const unsigned RegSize = (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128; assert((!Subtarget->hasAVX512() || RegSize < 512) && "512-bit vector requires AVX512"); assert((!Subtarget->hasAVX2() || RegSize < 256) && "256-bit vector requires AVX2"); auto ElemVT = InVT.getVectorElementType(); auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, RegSize / ElemVT.getSizeInBits()); assert(RegSize % InVT.getSizeInBits() == 0); unsigned NumConcat = RegSize / InVT.getSizeInBits(); SmallVector Ops(NumConcat, DAG.getUNDEF(InVT)); Ops[0] = N->getOperand(0); SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); Ops[0] = N->getOperand(1); SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1); Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, DAG.getIntPtrConstant(0, dl))); return; } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. case X86ISD::FMINC: case X86ISD::FMIN: case X86ISD::FMAXC: case X86ISD::FMAX: { EVT VT = N->getValueType(0); assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX."); SDValue UNDEF = DAG.getUNDEF(VT); SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(0), UNDEF); SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(1), UNDEF); Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); return; } case ISD::SIGN_EXTEND_INREG: case ISD::ADDC: case ISD::ADDE: case ISD::SUBC: case ISD::SUBE: // We don't want to expand or promote these. return; case ISD::SDIV: case ISD::UDIV: case ISD::SREM: case ISD::UREM: case ISD::SDIVREM: case ISD::UDIVREM: { SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); Results.push_back(V); return; } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; std::pair Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; if (FIST.getNode()) { EVT VT = N->getValueType(0); // Return a load from the stack slot. if (StackSlot.getNode()) Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo(), false, false, false, 0)); else Results.push_back(FIST); } return; } case ISD::UINT_TO_FP: { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); if (N->getOperand(0).getValueType() != MVT::v2i32 || N->getValueType(0) != MVT::v2f32) return; SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N->getOperand(0)); SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, DAG.getBitcast(MVT::v2i64, VBias)); Or = DAG.getBitcast(MVT::v2f64, Or); // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); return; } case ISD::FP_ROUND: { if (!TLI.isTypeLegal(N->getOperand(0).getValueType())) return; SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); Results.push_back(V); return; } case ISD::FP_EXTEND: { // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. // No other ValueType for FP_EXTEND should reach this point. assert(N->getValueType(0) == MVT::v2f32 && "Do not know how to legalize this Node"); return; } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default : llvm_unreachable("Do not know how to custom type " "legalize this intrinsic operation!"); case Intrinsic::x86_rdtsc: return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); case Intrinsic::x86_rdtscp: return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, Results); case Intrinsic::x86_rdpmc: return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); } } case ISD::INTRINSIC_WO_CHAIN: { if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)) Results.push_back(V); return; } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; SDValue cpInL, cpInH; cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), DAG.getConstant(0, dl, HalfT)); cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), DAG.getConstant(1, dl, HalfT)); cpInL = DAG.getCopyToReg(N->getOperand(0), dl, Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue()); cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX, cpInH, cpInL.getValue(1)); SDValue swapInL, swapInH; swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), DAG.getConstant(0, dl, HalfT)); swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), DAG.getConstant(1, dl, HalfT)); swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RBX : X86::EBX, swapInL, cpInH.getValue(1)); swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, swapInH, swapInL.getValue(1)); SDValue Ops[] = { swapInH.getValue(0), N->getOperand(1), swapInH.getValue(1) }; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast(N)->getMemOperand(); unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, Regs64bit ? X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, MVT::i32, cpOutH.getValue(2)); SDValue Success = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS); Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); Results.push_back(Success); Results.push_back(EFLAGS.getValue(1)); return; } case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case ISD::ATOMIC_LOAD: { // Delegate to generic TypeLegalization. Situations we can really handle // should have already been dealt with by AtomicExpandPass.cpp. break; } case ISD::BITCAST: { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); EVT DstVT = N->getValueType(0); EVT SrcVT = N->getOperand(0)->getValueType(0); if (SrcVT != MVT::f64 || (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) return; unsigned NumElts = DstVT.getVectorNumElements(); EVT SVT = DstVT.getVectorElementType(); EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0)); SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded); if (ExperimentalVectorWideningLegalization) { // If we are legalizing vectors by widening, we already have the desired // legal vector type, just return it. Results.push_back(ToVecInt); return; } SmallVector Elts; for (unsigned i = 0, e = NumElts; i != e; ++i) Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, ToVecInt, DAG.getIntPtrConstant(i, dl))); Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts)); } } } const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((X86ISD::NodeType)Opcode) { case X86ISD::FIRST_NUMBER: break; case X86ISD::BSF: return "X86ISD::BSF"; case X86ISD::BSR: return "X86ISD::BSR"; case X86ISD::SHLD: return "X86ISD::SHLD"; case X86ISD::SHRD: return "X86ISD::SHRD"; case X86ISD::FAND: return "X86ISD::FAND"; case X86ISD::FANDN: return "X86ISD::FANDN"; case X86ISD::FOR: return "X86ISD::FOR"; case X86ISD::FXOR: return "X86ISD::FXOR"; case X86ISD::FILD: return "X86ISD::FILD"; case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; case X86ISD::FLD: return "X86ISD::FLD"; case X86ISD::FST: return "X86ISD::FST"; case X86ISD::CALL: return "X86ISD::CALL"; case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; case X86ISD::UCOMI: return "X86ISD::UCOMI"; case X86ISD::CMPM: return "X86ISD::CMPM"; case X86ISD::CMPMU: return "X86ISD::CMPMU"; case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; case X86ISD::FSETCC: return "X86ISD::FSETCC"; case X86ISD::FGETSIGNx86: return "X86ISD::FGETSIGNx86"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; case X86ISD::IRET: return "X86ISD::IRET"; case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; case X86ISD::Wrapper: return "X86ISD::Wrapper"; case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::PSIGN: return "X86ISD::PSIGN"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; case X86ISD::ADDUS: return "X86ISD::ADDUS"; case X86ISD::SUBUS: return "X86ISD::SUBUS"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::ABS: return "X86ISD::ABS"; case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND"; case X86ISD::FMAXC: return "X86ISD::FMAXC"; case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; case X86ISD::VZEXT: return "X86ISD::VZEXT"; case X86ISD::VSEXT: return "X86ISD::VSEXT"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; case X86ISD::VINSERT: return "X86ISD::VINSERT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD"; case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; case X86ISD::VSRL: return "X86ISD::VSRL"; case X86ISD::VSRA: return "X86ISD::VSRA"; case X86ISD::VSHLI: return "X86ISD::VSHLI"; case X86ISD::VSRLI: return "X86ISD::VSRLI"; case X86ISD::VSRAI: return "X86ISD::VSRAI"; case X86ISD::VROTLI: return "X86ISD::VROTLI"; case X86ISD::VROTRI: return "X86ISD::VROTRI"; case X86ISD::CMPP: return "X86ISD::CMPP"; case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM"; case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM"; case X86ISD::ADD: return "X86ISD::ADD"; case X86ISD::SUB: return "X86ISD::SUB"; case X86ISD::ADC: return "X86ISD::ADC"; case X86ISD::SBB: return "X86ISD::SBB"; case X86ISD::SMUL: return "X86ISD::SMUL"; case X86ISD::UMUL: return "X86ISD::UMUL"; case X86ISD::SMUL8: return "X86ISD::SMUL8"; case X86ISD::UMUL8: return "X86ISD::UMUL8"; case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG"; case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG"; case X86ISD::INC: return "X86ISD::INC"; case X86ISD::DEC: return "X86ISD::DEC"; case X86ISD::OR: return "X86ISD::OR"; case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; case X86ISD::BEXTR: return "X86ISD::BEXTR"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; case X86ISD::TESTM: return "X86ISD::TESTM"; case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; case X86ISD::KTEST: return "X86ISD::KTEST"; case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::VALIGN: return "X86ISD::VALIGN"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; case X86ISD::SHUFP: return "X86ISD::SHUFP"; case X86ISD::SHUF128: return "X86ISD::SHUF128"; case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; case X86ISD::MOVSD: return "X86ISD::MOVSD"; case X86ISD::MOVSS: return "X86ISD::MOVSS"; case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; case X86ISD::VRANGE: return "X86ISD::VRANGE"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::PSADBW: return "X86ISD::PSADBW"; case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::MFENCE: return "X86ISD::MFENCE"; case X86ISD::SFENCE: return "X86ISD::SFENCE"; case X86ISD::LFENCE: return "X86ISD::LFENCE"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; case X86ISD::VPROT: return "X86ISD::VPROT"; case X86ISD::VPROTI: return "X86ISD::VPROTI"; case X86ISD::VPSHA: return "X86ISD::VPSHA"; case X86ISD::VPSHL: return "X86ISD::VPSHL"; case X86ISD::VPCOM: return "X86ISD::VPCOM"; case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; case X86ISD::FMADD: return "X86ISD::FMADD"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND"; case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND"; case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND"; case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; case X86ISD::EXPAND: return "X86ISD::EXPAND"; case X86ISD::SELECT: return "X86ISD::SELECT"; case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; case X86ISD::RCP28: return "X86ISD::RCP28"; case X86ISD::EXP2: return "X86ISD::EXP2"; case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; case X86ISD::SCALEF: return "X86ISD::SCALEF"; case X86ISD::ADDS: return "X86ISD::ADDS"; case X86ISD::SUBS: return "X86ISD::SUBS"; case X86ISD::AVG: return "X86ISD::AVG"; case X86ISD::MULHRS: return "X86ISD::MULHRS"; case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND"; case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND"; case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; } return nullptr; } // isLegalAddressingMode - Return true if the addressing mode represented // by AM is legal for this target, for a load/store of the specified type. bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { // X86 supports extremely general addressing modes. CodeModel::Model M = getTargetMachine().getCodeModel(); Reloc::Model R = getTargetMachine().getRelocationModel(); // X86 allows a sign-extended 32-bit immediate field as a displacement. if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) return false; if (AM.BaseGV) { unsigned GVFlags = Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); // If a reference to this global requires an extra load, we can't fold it. if (isGlobalStubReference(GVFlags)) return false; // If BaseGV requires a register for the PIC base, we cannot also have a // BaseReg specified. if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) return false; // If lower 4G is not available, then we must use rip-relative addressing. if ((M != CodeModel::Small || R != Reloc::Static) && Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) return false; } switch (AM.Scale) { case 0: case 1: case 2: case 4: case 8: // These scales always work. break; case 3: case 5: case 9: // These scales are formed with basereg+scalereg. Only accept if there is // no basereg yet. if (AM.HasBaseReg) return false; break; default: // Other stuff never works. return false; } return true; } bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { unsigned Bits = Ty->getScalarSizeInBits(); // 8-bit shifts are always expensive, but versions with a scalar amount aren't // particularly cheaper than those without. if (Bits == 8) return false; // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make // variable shifts just as cheap as scalar ones. if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64)) return false; // Otherwise, it's significantly cheaper to shift by a scalar amount than by a // fully general vector. return true; } bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 > NumBits2; } bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; if (!isTypeLegal(EVT::getEVT(Ty1))) return false; assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); // Assuming the caller doesn't have a zeroext or signext return parameter, // truncation all the way down to i1 is valid. return true; } bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<32>(Imm); } bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { // Can also use sub to handle negated immediates. return isInt<32>(Imm); } bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (!VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 > NumBits2; } bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); } bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); } bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { EVT VT1 = Val.getValueType(); if (isZExtFree(VT1, VT2)) return true; if (Val.getOpcode() != ISD::LOAD) return false; if (!VT1.isSimple() || !VT1.isInteger() || !VT2.isSimple() || !VT2.isInteger()) return false; switch (VT1.getSimpleVT().SimpleTy) { default: break; case MVT::i8: case MVT::i16: case MVT::i32: // X86 has 8, 16, and 32-bit zero-extending loads. return true; } return false; } bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { if (!Subtarget->hasAnyFMA()) return false; VT = VT.getScalarType(); if (!VT.isSimple()) return false; switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: case MVT::f64: return true; default: break; } return false; } bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { // i16 instructions are longer (0x66 prefix) and potentially slower. return !(VT1 == MVT::i32 && VT2 == MVT::i16); } /// isShuffleMaskLegal - Targets can use this to indicate that they only /// support *some* VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values /// are assumed to be legal. bool X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const { if (!VT.isSimple()) return false; // Not for i1 vectors if (VT.getSimpleVT().getScalarType() == MVT::i1) return false; // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSimpleVT().getSizeInBits() == 64) return false; // We only care that the types being shuffled are legal. The lowering can // handle any possible shuffle mask that results. return isTypeLegal(VT.getSimpleVT()); } bool X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl &Mask, EVT VT) const { // Just delegate to the generic legality, clear masks aren't special. return isShuffleMaskLegal(Mask, VT); } //===----------------------------------------------------------------------===// // X86 Scheduler Hooks //===----------------------------------------------------------------------===// /// Utility function to emit xbegin specifying the start of an RTM region. static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII) { DebugLoc DL = MI->getDebugLoc(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // For the v = xbegin(), we generate // // thisMBB: // xbegin sinkMBB // // mainMBB: // eax = -1 // // sinkMBB: // v = eax MachineBasicBlock *thisMBB = MBB; MachineFunction *MF = MBB->getParent(); MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: // xbegin sinkMBB // # fallthrough to mainMBB // # abortion to sinkMBB BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(sinkMBB); // mainMBB: // EAX = -1 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); mainMBB->addSuccessor(sinkMBB); // sinkMBB: // EAX is live into the sinkMBB sinkMBB->addLiveIn(X86::EAX); BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) .addReg(X86::EAX); MI->eraseFromParent(); return sinkMBB; } // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 // or XMM0_V32I8 in AVX all of this code can be replaced with that // in the .td file. static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, const TargetInstrInfo *TII) { unsigned Opc; switch (MI->getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; } DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); unsigned NumArgs = MI->getNumOperands(); for (unsigned i = 1; i < NumArgs; ++i) { MachineOperand &Op = MI->getOperand(i); if (!(Op.isReg() && Op.isImplicit())) MIB.addOperand(Op); } if (MI->hasOneMemOperand()) MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) .addReg(X86::XMM0); MI->eraseFromParent(); return BB; } // FIXME: Custom handling because TableGen doesn't support multiple implicit // defs in an instruction pattern static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, const TargetInstrInfo *TII) { unsigned Opc; switch (MI->getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; } DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); unsigned NumArgs = MI->getNumOperands(); // remove the results for (unsigned i = 1; i < NumArgs; ++i) { MachineOperand &Op = MI->getOperand(i); if (!(Op.isReg() && Op.isImplicit())) MIB.addOperand(Op); } if (MI->hasOneMemOperand()) MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) .addReg(X86::ECX); MI->eraseFromParent(); return BB; } static MachineBasicBlock *EmitWRPKRU(MachineInstr *MI, MachineBasicBlock *BB, const X86Subtarget *Subtarget) { DebugLoc dl = MI->getDebugLoc(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); // insert input VAL into EAX BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) .addReg(MI->getOperand(0).getReg()); // insert zero to ECX BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX) .addReg(X86::ECX) .addReg(X86::ECX); // insert zero to EDX BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::EDX) .addReg(X86::EDX) .addReg(X86::EDX); // insert WRPKRU instruction BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); MI->eraseFromParent(); // The pseudo is gone now. return BB; } static MachineBasicBlock *EmitRDPKRU(MachineInstr *MI, MachineBasicBlock *BB, const X86Subtarget *Subtarget) { DebugLoc dl = MI->getDebugLoc(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); // insert zero to ECX BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX) .addReg(X86::ECX) .addReg(X86::ECX); // insert RDPKRU instruction BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) .addReg(X86::EAX); MI->eraseFromParent(); // The pseudo is gone now. return BB; } static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, const X86Subtarget *Subtarget) { DebugLoc dl = MI->getDebugLoc(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); // Address into RAX/EAX, other two args into ECX, EDX. unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); for (int i = 0; i < X86::AddrNumOperands; ++i) MIB.addOperand(MI->getOperand(i)); unsigned ValOps = X86::AddrNumOperands; BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) .addReg(MI->getOperand(ValOps).getReg()); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) .addReg(MI->getOperand(ValOps+1).getReg()); // The instruction doesn't actually take any operands though. BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); MI->eraseFromParent(); // The pseudo is gone now. return BB; } MachineBasicBlock * X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const { // Emit va_arg instruction on X86-64. // Operands to this pseudo-instruction: // 0 ) Output : destination address (reg) // 1-5) Input : va_list address (addr, i64mem) // 6 ) ArgSize : Size (in bytes) of vararg type // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset // 8 ) Align : Alignment of type // 9 ) EFLAGS (implicit-def) assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); static_assert(X86::AddrNumOperands == 5, "VAARG_64 assumes 5 address operands"); unsigned DestReg = MI->getOperand(0).getReg(); MachineOperand &Base = MI->getOperand(1); MachineOperand &Scale = MI->getOperand(2); MachineOperand &Index = MI->getOperand(3); MachineOperand &Disp = MI->getOperand(4); MachineOperand &Segment = MI->getOperand(5); unsigned ArgSize = MI->getOperand(6).getImm(); unsigned ArgMode = MI->getOperand(7).getImm(); unsigned Align = MI->getOperand(8).getImm(); // Memory Reference assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); // Machine Information const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); DebugLoc DL = MI->getDebugLoc(); // struct va_list { // i32 gp_offset // i32 fp_offset // i64 overflow_area (address) // i64 reg_save_area (address) // } // sizeof(va_list) = 24 // alignment(va_list) = 8 unsigned TotalNumIntRegs = 6; unsigned TotalNumXMMRegs = 8; bool UseGPOffset = (ArgMode == 1); bool UseFPOffset = (ArgMode == 2); unsigned MaxOffset = TotalNumIntRegs * 8 + (UseFPOffset ? TotalNumXMMRegs * 16 : 0); /* Align ArgSize to a multiple of 8 */ unsigned ArgSizeA8 = (ArgSize + 7) & ~7; bool NeedsAlign = (Align > 8); MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *overflowMBB; MachineBasicBlock *offsetMBB; MachineBasicBlock *endMBB; unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB unsigned OffsetReg = 0; if (!UseGPOffset && !UseFPOffset) { // If we only pull from the overflow region, we don't create a branch. // We don't need to alter control flow. OffsetDestReg = 0; // unused OverflowDestReg = DestReg; offsetMBB = nullptr; overflowMBB = thisMBB; endMBB = thisMBB; } else { // First emit code to check if gp_offset (or fp_offset) is below the bound. // If so, pull the argument from reg_save_area. (branch to offsetMBB) // If not, pull from overflow_area. (branch to overflowMBB) // // thisMBB // | . // | . // offsetMBB overflowMBB // | . // | . // endMBB // Registers for the PHI in endMBB OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *MF = MBB->getParent(); overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); endMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator MBBIter = ++MBB->getIterator(); // Insert the new basic blocks MF->insert(MBBIter, offsetMBB); MF->insert(MBBIter, overflowMBB); MF->insert(MBBIter, endMBB); // Transfer the remainder of MBB and its successor edges to endMBB. endMBB->splice(endMBB->begin(), thisMBB, std::next(MachineBasicBlock::iterator(MI)), thisMBB->end()); endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); // Make offsetMBB and overflowMBB successors of thisMBB thisMBB->addSuccessor(offsetMBB); thisMBB->addSuccessor(overflowMBB); // endMBB is a successor of both offsetMBB and overflowMBB offsetMBB->addSuccessor(endMBB); overflowMBB->addSuccessor(endMBB); // Load the offset value into a register OffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) .addOperand(Base) .addOperand(Scale) .addOperand(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .addOperand(Segment) .setMemRefs(MMOBegin, MMOEnd); // Check if there is enough room left to pull this argument. BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) .addReg(OffsetReg) .addImm(MaxOffset + 8 - ArgSizeA8); // Branch to "overflowMBB" if offset >= max // Fall through to "offsetMBB" otherwise BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) .addMBB(overflowMBB); } // In offsetMBB, emit code to use the reg_save_area. if (offsetMBB) { assert(OffsetReg != 0); // Read the reg_save_area address. unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) .addOperand(Base) .addOperand(Scale) .addOperand(Index) .addDisp(Disp, 16) .addOperand(Segment) .setMemRefs(MMOBegin, MMOEnd); // Zero-extend the offset unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) .addImm(0) .addReg(OffsetReg) .addImm(X86::sub_32bit); // Add the offset to the reg_save_area to get the final address. BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) .addReg(OffsetReg64) .addReg(RegSaveReg); // Compute the offset for the next argument unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) .addReg(OffsetReg) .addImm(UseFPOffset ? 16 : 8); // Store it back into the va_list. BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) .addOperand(Base) .addOperand(Scale) .addOperand(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .addOperand(Segment) .addReg(NextOffsetReg) .setMemRefs(MMOBegin, MMOEnd); // Jump to endMBB BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) .addMBB(endMBB); } // // Emit code to use overflow area // // Load the overflow_area address into a register. unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) .addOperand(Base) .addOperand(Scale) .addOperand(Index) .addDisp(Disp, 8) .addOperand(Segment) .setMemRefs(MMOBegin, MMOEnd); // If we need to align it, do so. Otherwise, just copy the address // to OverflowDestReg. if (NeedsAlign) { // Align the overflow address assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); // aligned_addr = (addr + (align-1)) & ~(align-1) BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) .addReg(OverflowAddrReg) .addImm(Align-1); BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) .addReg(TmpReg) .addImm(~(uint64_t)(Align-1)); } else { BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) .addReg(OverflowAddrReg); } // Compute the next overflow address after this argument. // (the overflow address should be kept 8-byte aligned) unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) .addReg(OverflowDestReg) .addImm(ArgSizeA8); // Store the new overflow address. BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) .addOperand(Base) .addOperand(Scale) .addOperand(Index) .addDisp(Disp, 8) .addOperand(Segment) .addReg(NextAddrReg) .setMemRefs(MMOBegin, MMOEnd); // If we branched, emit the PHI to the front of endMBB. if (offsetMBB) { BuildMI(*endMBB, endMBB->begin(), DL, TII->get(X86::PHI), DestReg) .addReg(OffsetDestReg).addMBB(offsetMBB) .addReg(OverflowDestReg).addMBB(overflowMBB); } // Erase the pseudo instruction MI->eraseFromParent(); return endMBB; } MachineBasicBlock * X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( MachineInstr *MI, MachineBasicBlock *MBB) const { // Emit code to save XMM registers to the stack. The ABI says that the // number of registers to save is given in %al, so it's theoretically // possible to do an indirect jump trick to avoid saving all of them, // however this code takes a simpler approach and just executes all // of the stores if %al is non-zero. It's less code, and it's probably // easier on the hardware branch predictor, and stores aren't all that // expensive anyway. // Create the new basic blocks. One block contains all the XMM stores, // and one block is the final destination regardless of whether any // stores were performed. const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *F = MBB->getParent(); MachineFunction::iterator MBBIter = ++MBB->getIterator(); MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(MBBIter, XMMSaveMBB); F->insert(MBBIter, EndMBB); // Transfer the remainder of MBB and its successor edges to EndMBB. EndMBB->splice(EndMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); EndMBB->transferSuccessorsAndUpdatePHIs(MBB); // The original block will now fall through to the XMM save block. MBB->addSuccessor(XMMSaveMBB); // The XMMSaveMBB will fall through to the end block. XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned CountReg = MI->getOperand(0).getReg(); int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) { // If %al is 0, branch around the XMM save block. BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); MBB->addSuccessor(EndMBB); } // Make sure the last operand is EFLAGS, which gets clobbered by the branch // that was just emitted, but clearly shouldn't be "saved". assert((MI->getNumOperands() <= 3 || !MI->getOperand(MI->getNumOperands() - 1).isReg() || MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS) && "Expected last argument to be EFLAGS"); unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; // In the XMM save block, save all the XMM argument registers. for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) { int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; MachineMemOperand *MMO = F->getMachineMemOperand( MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), MachineMemOperand::MOStore, /*Size=*/16, /*Align=*/16); BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) .addFrameIndex(RegSaveFrameIndex) .addImm(/*Scale=*/1) .addReg(/*IndexReg=*/0) .addImm(/*Disp=*/Offset) .addReg(/*Segment=*/0) .addReg(MI->getOperand(i).getReg()) .addMemOperand(MMO); } MI->eraseFromParent(); // The pseudo instruction is gone now. return EndMBB; } // The EFLAGS operand of SelectItr might be missing a kill marker // because there were multiple uses of EFLAGS, and ISel didn't know // which to mark. Figure out whether SelectItr should have had a // kill marker, and set it if it should. Returns the correct kill // marker value. static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock* BB, const TargetRegisterInfo* TRI) { // Scan forward through BB for a use/def of EFLAGS. MachineBasicBlock::iterator miI(std::next(SelectItr)); for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { const MachineInstr& mi = *miI; if (mi.readsRegister(X86::EFLAGS)) return false; if (mi.definesRegister(X86::EFLAGS)) break; // Should have kill-flag - update below. } // If we hit the end of the block, check whether EFLAGS is live into a // successor. if (miI == BB->end()) { for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), sEnd = BB->succ_end(); sItr != sEnd; ++sItr) { MachineBasicBlock* succ = *sItr; if (succ->isLiveIn(X86::EFLAGS)) return false; } } // We found a def, or hit the end of the basic block and EFLAGS wasn't live // out. SelectMI should have a kill flag on EFLAGS. SelectItr->addRegisterKilled(X86::EFLAGS, TRI); return true; } // Return true if it is OK for this CMOV pseudo-opcode to be cascaded // together with other CMOV pseudo-opcodes into a single basic-block with // conditional jump around it. static bool isCMOVPseudo(MachineInstr *MI) { switch (MI->getOpcode()) { case X86::CMOV_FR32: case X86::CMOV_FR64: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: case X86::CMOV_V2F64: case X86::CMOV_V2I64: case X86::CMOV_V4F32: case X86::CMOV_V4F64: case X86::CMOV_V4I64: case X86::CMOV_V16F32: case X86::CMOV_V8F32: case X86::CMOV_V8F64: case X86::CMOV_V8I64: case X86::CMOV_V8I1: case X86::CMOV_V16I1: case X86::CMOV_V32I1: case X86::CMOV_V64I1: return true; default: return false; } } MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. The incoming instruction knows the // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... // TrueVal = ... // cmpTY ccX, r1, r2 // bCC copy1MBB // fallthrough --> copy0MBB MachineBasicBlock *thisMBB = BB; MachineFunction *F = BB->getParent(); // This code lowers all pseudo-CMOV instructions. Generally it lowers these // as described above, by inserting a BB, and then making a PHI at the join // point to select the true and false operands of the CMOV in the PHI. // // The code also handles two different cases of multiple CMOV opcodes // in a row. // // Case 1: // In this case, there are multiple CMOVs in a row, all which are based on // the same condition setting (or the exact opposite condition setting). // In this case we can lower all the CMOVs using a single inserted BB, and // then make a number of PHIs at the join point to model the CMOVs. The only // trickiness here, is that in a case like: // // t2 = CMOV cond1 t1, f1 // t3 = CMOV cond1 t2, f2 // // when rewriting this into PHIs, we have to perform some renaming on the // temps since you cannot have a PHI operand refer to a PHI result earlier // in the same block. The "simple" but wrong lowering would be: // // t2 = PHI t1(BB1), f1(BB2) // t3 = PHI t2(BB1), f2(BB2) // // but clearly t2 is not defined in BB1, so that is incorrect. The proper // renaming is to note that on the path through BB1, t2 is really just a // copy of t1, and do that renaming, properly generating: // // t2 = PHI t1(BB1), f1(BB2) // t3 = PHI t1(BB1), f2(BB2) // // Case 2, we lower cascaded CMOVs such as // // (CMOV (CMOV F, T, cc1), T, cc2) // // to two successives branches. For that, we look for another CMOV as the // following instruction. // // Without this, we would add a PHI between the two jumps, which ends up // creating a few copies all around. For instance, for // // (sitofp (zext (fcmp une))) // // we would generate: // // ucomiss %xmm1, %xmm0 // movss <1.0f>, %xmm0 // movaps %xmm0, %xmm1 // jne .LBB5_2 // xorps %xmm1, %xmm1 // .LBB5_2: // jp .LBB5_4 // movaps %xmm1, %xmm0 // .LBB5_4: // retq // // because this custom-inserter would have generated: // // A // | \ // | B // | / // C // | \ // | D // | / // E // // A: X = ...; Y = ... // B: empty // C: Z = PHI [X, A], [Y, B] // D: empty // E: PHI [X, C], [Z, D] // // If we lower both CMOVs in a single step, we can instead generate: // // A // | \ // | C // | /| // |/ | // | | // | D // | / // E // // A: X = ...; Y = ... // D: empty // E: PHI [X, A], [X, C], [Y, D] // // Which, in our sitofp/fcmp example, gives us something like: // // ucomiss %xmm1, %xmm0 // movss <1.0f>, %xmm0 // jne .LBB5_4 // jp .LBB5_4 // xorps %xmm0, %xmm0 // .LBB5_4: // retq // MachineInstr *CascadedCMOV = nullptr; MachineInstr *LastCMOV = MI; X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineBasicBlock::iterator NextMIIt = std::next(MachineBasicBlock::iterator(MI)); // Check for case 1, where there are multiple CMOVs with the same condition // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the // number of jumps the most. if (isCMOVPseudo(MI)) { // See if we have a string of CMOVS with the same condition. while (NextMIIt != BB->end() && isCMOVPseudo(NextMIIt) && (NextMIIt->getOperand(3).getImm() == CC || NextMIIt->getOperand(3).getImm() == OppCC)) { LastCMOV = &*NextMIIt; ++NextMIIt; } } // This checks for case 2, but only do this if we didn't already find // case 1, as indicated by LastCMOV == MI. if (LastCMOV == MI && NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() && NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) { CascadedCMOV = &*NextMIIt; } MachineBasicBlock *jcc1MBB = nullptr; // If we have a cascaded CMOV, we lower it to two successive branches to // the same block. EFLAGS is used by both, so mark it as live in the second. if (CascadedCMOV) { jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, jcc1MBB); jcc1MBB->addLiveIn(X86::EFLAGS); } MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, copy0MBB); F->insert(It, sinkMBB); // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV; if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); sinkMBB->addLiveIn(X86::EFLAGS); } // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); // Add the true and fallthrough blocks as its successors. if (CascadedCMOV) { // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV. BB->addSuccessor(jcc1MBB); // In that case, jcc1MBB will itself fallthrough the copy0MBB, and // jump to the sinkMBB. jcc1MBB->addSuccessor(copy0MBB); jcc1MBB->addSuccessor(sinkMBB); } else { BB->addSuccessor(copy0MBB); } // The true block target of the first (or only) branch is always sinkMBB. BB->addSuccessor(sinkMBB); // Create the conditional branch instruction. unsigned Opc = X86::GetCondBranchFromCond(CC); BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); if (CascadedCMOV) { unsigned Opc2 = X86::GetCondBranchFromCond( (X86::CondCode)CascadedCMOV->getOperand(3).getImm()); BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); } // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB copy0MBB->addSuccessor(sinkMBB); // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); MachineBasicBlock::iterator MIItEnd = std::next(MachineBasicBlock::iterator(LastCMOV)); MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin(); DenseMap> RegRewriteTable; MachineInstrBuilder MIB; // As we are creating the PHIs, we have to be careful if there is more than // one. Later CMOVs may reference the results of earlier CMOVs, but later // PHIs have to reference the individual true/false inputs from earlier PHIs. // That also means that PHI construction must work forward from earlier to // later, and that the code must maintain a mapping from earlier PHI's // destination registers, and the registers that went into the PHI. for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { unsigned DestReg = MIIt->getOperand(0).getReg(); unsigned Op1Reg = MIIt->getOperand(1).getReg(); unsigned Op2Reg = MIIt->getOperand(2).getReg(); // If this CMOV we are generating is the opposite condition from // the jump we generated, then we have to swap the operands for the // PHI that is going to be generated. if (MIIt->getOperand(3).getImm() == OppCC) std::swap(Op1Reg, Op2Reg); if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end()) Op1Reg = RegRewriteTable[Op1Reg].first; if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end()) Op2Reg = RegRewriteTable[Op2Reg].second; MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg) .addReg(Op1Reg).addMBB(copy0MBB) .addReg(Op2Reg).addMBB(thisMBB); // Add this PHI to the rewrite table. RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); } // If we have a cascaded CMOV, the second Jcc provides the same incoming // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). if (CascadedCMOV) { MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB); // Copy the PHI result to the register defined by the second CMOV. BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL, TII->get(TargetOpcode::COPY), CascadedCMOV->getOperand(0).getReg()) .addReg(MI->getOperand(0).getReg()); CascadedCMOV->eraseFromParent(); } // Now remove the CMOV(s). for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ) (MIIt++)->eraseFromParent(); return sinkMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI, MachineBasicBlock *BB) const { // Combine the following atomic floating-point modification pattern: // a.store(reg OP a.load(acquire), release) // Transform them into: // OPss (%gpr), %xmm // movss %xmm, (%gpr) // Or sd equivalent for 64-bit operations. unsigned MOp, FOp; switch (MI->getOpcode()) { default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break; case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break; } const X86InstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); MachineOperand MSrc = MI->getOperand(0); unsigned VSrc = MI->getOperand(5).getReg(); const MachineOperand &Disp = MI->getOperand(3); MachineOperand ZeroDisp = MachineOperand::CreateImm(0); bool hasDisp = Disp.isGlobal() || Disp.isImm(); if (hasDisp && MSrc.isReg()) MSrc.setIsKill(false); MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp)) .addOperand(/*Base=*/MSrc) .addImm(/*Scale=*/1) .addReg(/*Index=*/0) .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0) .addReg(0); MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp), MRI.createVirtualRegister(MRI.getRegClass(VSrc))) .addReg(VSrc) .addOperand(/*Base=*/MSrc) .addImm(/*Scale=*/1) .addReg(/*Index=*/0) .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0) .addReg(/*Segment=*/0); MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill); MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); const bool Is64Bit = Subtarget->is64Bit(); const bool IsLP64 = Subtarget->isTarget64BitLP64(); const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; // BB: // ... [Till the alloca] // If stacklet is not large enough, jump to mallocMBB // // bumpMBB: // Allocate by subtracting from RSP // Jump to continueMBB // // mallocMBB: // Allocate by call to runtime // // continueMBB: // ... // [rest of original BB] // MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(getPointerTy(MF->getDataLayout())); unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI->getOperand(1).getReg(), physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP; MachineFunction::iterator MBBIter = ++BB->getIterator(); MF->insert(MBBIter, bumpMBB); MF->insert(MBBIter, mallocMBB); MF->insert(MBBIter, continueMBB); continueMBB->splice(continueMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); continueMBB->transferSuccessorsAndUpdatePHIs(BB); // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. const uint32_t *RegMask = Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); } else if (Is64Bit) { BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EDI, RegState::Implicit) .addReg(X86::EAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EAX, RegState::ImplicitDefine); } if (!Is64Bit) BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) .addImm(16); BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) .addReg(IsLP64 ? X86::RAX : X86::EAX); BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Set up the CFG correctly. BB->addSuccessor(bumpMBB); BB->addSuccessor(mallocMBB); mallocMBB->addSuccessor(continueMBB); bumpMBB->addSuccessor(continueMBB); // Take care of the PHI nodes. BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), MI->getOperand(0).getReg()) .addReg(mallocPtrVReg).addMBB(mallocMBB) .addReg(bumpSPPtrVReg).addMBB(bumpMBB); // Delete the original pseudo instruction. MI->eraseFromParent(); // And we're done. return continueMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { assert(!Subtarget->isTargetMachO()); DebugLoc DL = MI->getDebugLoc(); MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe( *BB->getParent(), *BB, MI, DL, false); MachineBasicBlock *ResumeBB = ResumeMI->getParent(); MI->eraseFromParent(); // The pseudo instruction is gone now. return ResumeBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB(); DebugLoc DL = MI->getDebugLoc(); assert(!isAsynchronousEHPersonality( classifyEHPersonality(MF->getFunction()->getPersonalityFn())) && "SEH does not use catchret!"); // Only 32-bit EH needs to worry about manually restoring stack pointers. if (!Subtarget->is32Bit()) return BB; // C++ EH creates a new target block to hold the restore code, and wires up // the new block to the return destination with a normal JMP_4. MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); assert(BB->succ_size() == 1); MF->insert(std::next(BB->getIterator()), RestoreMBB); RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(RestoreMBB); MI->getOperand(0).setMBB(RestoreMBB); auto RestoreMBBI = RestoreMBB->begin(); BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE)); BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const Constant *PerFn = MF->getFunction()->getPersonalityFn(); bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); // Only 32-bit SEH requires special handling for catchpad. if (IsSEH && Subtarget->is32Bit()) { const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); } MI->eraseFromParent(); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, MachineBasicBlock *BB) const { // This is pretty easy. We're taking the value that we received from // our load from the relocation, sticking it in either RDI (x86-64) // or EAX and doing an indirect call. The return value will then // be in the normal return register. MachineFunction *F = BB->getParent(); const X86InstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); assert(MI->getOperand(3).isGlobal() && "This should be a global"); // Get a register mask for the lowered call. // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = Subtarget->is64Bit() ? Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() : Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) .addReg(X86::RIP) .addImm(0).addReg(0) .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, MI->getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(0) .addImm(0).addReg(0) .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, MI->getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } else { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(TII->getGlobalBaseReg(F)) .addImm(0).addReg(0) .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, MI->getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); unsigned DstReg; unsigned MemOpndSlot = 0; unsigned CurOp = 0; DstReg = MI->getOperand(CurOp++).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(RC->hasType(MVT::i32) && "Invalid destination!"); unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); MemOpndSlot = CurOp; MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); // For v = setjmp(buf), we generate // // thisMBB: // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB // SjLjSetup restoreMBB // // mainMBB: // v_main = 0 // // sinkMBB: // v = phi(main, restore) // // restoreMBB: // if base pointer being used, load it from frame // v_restore = 1 MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, sinkMBB); MF->push_back(restoreMBB); restoreMBB->setHasAddressTaken(); MachineInstrBuilder MIB; // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: unsigned PtrStoreOpc = 0; unsigned LabelReg = 0; const int64_t LabelOffset = 1 * PVT.getStoreSize(); Reloc::Model RM = MF->getTarget().getRelocationModel(); bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); // Prepare IP either in reg or imm. if (!UseImmLabel) { PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; const TargetRegisterClass *PtrRC = getRegClassFor(PVT); LabelReg = MRI.createVirtualRegister(PtrRC); if (Subtarget->is64Bit()) { MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) .addReg(X86::RIP) .addImm(0) .addReg(0) .addMBB(restoreMBB) .addReg(0); } else { const X86InstrInfo *XII = static_cast(TII); MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) .addReg(XII->getGlobalBaseReg(MF)) .addImm(0) .addReg(0) .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) .addReg(0); } } else PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; // Store IP MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); else MIB.addOperand(MI->getOperand(MemOpndSlot + i)); } if (!UseImmLabel) MIB.addReg(LabelReg); else MIB.addMBB(restoreMBB); MIB.setMemRefs(MMOBegin, MMOEnd); // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); // mainMBB: // EAX = 0 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); mainMBB->addSuccessor(sinkMBB); // sinkMBB: BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) .addReg(mainDstReg).addMBB(mainMBB) .addReg(restoreDstReg).addMBB(restoreMBB); // restoreMBB: if (RegInfo->hasBasePointer(*MF)) { const bool Uses64BitFramePtr = Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64(); X86MachineFunctionInfo *X86FI = MF->getInfo(); X86FI->setRestoreBasePointer(MF); unsigned FramePtr = RegInfo->getFrameRegister(*MF); unsigned BasePtr = RegInfo->getBaseRegister(); unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), FramePtr, true, X86FI->getRestoreBasePointerOffset()) .setMIFlag(MachineInstr::FrameSetup); } BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); restoreMBB->addSuccessor(sinkMBB); MI->eraseFromParent(); return sinkMBB; } MachineBasicBlock * X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); const TargetRegisterClass *RC = (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); MachineInstrBuilder MIB; const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t SPOffset = 2 * PVT.getStoreSize(); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; // Reload FP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) MIB.addOperand(MI->getOperand(i)); MIB.setMemRefs(MMOBegin, MMOEnd); // Reload IP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI->getOperand(i), LabelOffset); else MIB.addOperand(MI->getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload SP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI->getOperand(i), SPOffset); else MIB.addOperand(MI->getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Jump BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); MI->eraseFromParent(); return MBB; } // Replace 213-type (isel default) FMA3 instructions with 231-type for // accumulator loops. Writing back to the accumulator allows the coalescer // to remove extra copies in the loop. // FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937). MachineBasicBlock * X86TargetLowering::emitFMA3Instr(MachineInstr *MI, MachineBasicBlock *MBB) const { MachineOperand &AddendOp = MI->getOperand(3); // Bail out early if the addend isn't a register - we can't switch these. if (!AddendOp.isReg()) return MBB; MachineFunction &MF = *MBB->getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); // Check whether the addend is defined by a PHI: assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?"); MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg()); if (!AddendDef.isPHI()) return MBB; // Look for the following pattern: // loop: // %addend = phi [%entry, 0], [%loop, %result] // ... // %result = FMA213 %m2, %m1, %addend // Replace with: // loop: // %addend = phi [%entry, 0], [%loop, %result] // ... // %result = FMA231 %addend, %m1, %m2 for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) { assert(AddendDef.getOperand(i).isReg()); MachineOperand PHISrcOp = AddendDef.getOperand(i); MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg()); if (&PHISrcInst == MI) { // Found a matching instruction. unsigned NewFMAOpc = 0; switch (MI->getOpcode()) { case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break; case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break; case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break; case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break; case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break; case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break; case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break; case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break; case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break; case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break; case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break; case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break; case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break; case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break; case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break; case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break; case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break; case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break; case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break; case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break; case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break; case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break; case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break; case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break; case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break; case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break; case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break; case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break; case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break; case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break; case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break; case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break; default: llvm_unreachable("Unrecognized FMA variant."); } const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) .addOperand(MI->getOperand(0)) .addOperand(MI->getOperand(3)) .addOperand(MI->getOperand(2)) .addOperand(MI->getOperand(1)); MBB->insert(MachineBasicBlock::iterator(MI), MIB); MI->eraseFromParent(); } } return MBB; } MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { switch (MI->getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case X86::TAILJMPd64: case X86::TAILJMPr64: case X86::TAILJMPm64: case X86::TAILJMPd64_REX: case X86::TAILJMPr64_REX: case X86::TAILJMPm64_REX: llvm_unreachable("TAILJMP64 would not be touched here."); case X86::TCRETURNdi64: case X86::TCRETURNri64: case X86::TCRETURNmi64: return BB; case X86::WIN_ALLOCA: return EmitLoweredWinAlloca(MI, BB); case X86::CATCHRET: return EmitLoweredCatchRet(MI, BB); case X86::CATCHPAD: return EmitLoweredCatchPad(MI, BB); case X86::SEG_ALLOCA_32: case X86::SEG_ALLOCA_64: return EmitLoweredSegAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); case X86::CMOV_FR32: case X86::CMOV_FR64: case X86::CMOV_FR128: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: case X86::CMOV_V2F64: case X86::CMOV_V2I64: case X86::CMOV_V4F32: case X86::CMOV_V4F64: case X86::CMOV_V4I64: case X86::CMOV_V16F32: case X86::CMOV_V8F32: case X86::CMOV_V8F64: case X86::CMOV_V8I64: case X86::CMOV_V8I1: case X86::CMOV_V16I1: case X86::CMOV_V32I1: case X86::CMOV_V64I1: return EmitLoweredSelect(MI, BB); case X86::RDFLAGS32: case X86::RDFLAGS64: { DebugLoc DL = MI->getDebugLoc(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); unsigned PushF = MI->getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; unsigned Pop = MI->getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; BuildMI(*BB, MI, DL, TII->get(PushF)); BuildMI(*BB, MI, DL, TII->get(Pop), MI->getOperand(0).getReg()); MI->eraseFromParent(); // The pseudo is gone now. return BB; } case X86::WRFLAGS32: case X86::WRFLAGS64: { DebugLoc DL = MI->getDebugLoc(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); unsigned Push = MI->getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; unsigned PopF = MI->getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64; BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI->getOperand(0).getReg()); BuildMI(*BB, MI, DL, TII->get(PopF)); MI->eraseFromParent(); // The pseudo is gone now. return BB; } case X86::RELEASE_FADD32mr: case X86::RELEASE_FADD64mr: return EmitLoweredAtomicFP(MI, BB); case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: case X86::FP64_TO_INT16_IN_MEM: case X86::FP64_TO_INT32_IN_MEM: case X86::FP64_TO_INT64_IN_MEM: case X86::FP80_TO_INT16_IN_MEM: case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { MachineFunction *F = BB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); // Load the old value of the high byte of the control word... unsigned OldCW = F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), CWFrameIdx); // Set the high part to be round to zero... addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) .addImm(0xC7F); // Reload the modified control word now... addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); // Restore the memory image of control word to original value addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) .addReg(OldCW); // Get the X86 opcode to use. unsigned Opc; switch (MI->getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; } X86AddressMode AM; MachineOperand &Op = MI->getOperand(0); if (Op.isReg()) { AM.BaseType = X86AddressMode::RegBase; AM.Base.Reg = Op.getReg(); } else { AM.BaseType = X86AddressMode::FrameIndexBase; AM.Base.FrameIndex = Op.getIndex(); } Op = MI->getOperand(1); if (Op.isImm()) AM.Scale = Op.getImm(); Op = MI->getOperand(2); if (Op.isImm()) AM.IndexReg = Op.getImm(); Op = MI->getOperand(3); if (Op.isGlobal()) { AM.GV = Op.getGlobal(); } else { AM.Disp = Op.getImm(); } addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); // Reload the original control word now. addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } // String/text processing lowering. case X86::PCMPISTRM128REG: case X86::VPCMPISTRM128REG: case X86::PCMPISTRM128MEM: case X86::VPCMPISTRM128MEM: case X86::PCMPESTRM128REG: case X86::VPCMPESTRM128REG: case X86::PCMPESTRM128MEM: case X86::VPCMPESTRM128MEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo()); // String/text processing lowering. case X86::PCMPISTRIREG: case X86::VPCMPISTRIREG: case X86::PCMPISTRIMEM: case X86::VPCMPISTRIMEM: case X86::PCMPESTRIREG: case X86::VPCMPESTRIREG: case X86::PCMPESTRIMEM: case X86::VPCMPESTRIMEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo()); // Thread synchronization. case X86::MONITOR: return EmitMonitor(MI, BB, Subtarget); // PKU feature case X86::WRPKRU: return EmitWRPKRU(MI, BB, Subtarget); case X86::RDPKRU: return EmitRDPKRU(MI, BB, Subtarget); // xbegin case X86::XBEGIN: return EmitXBegin(MI, BB, Subtarget->getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); case X86::VAARG_64: return EmitVAARG64WithCustomInserter(MI, BB); case X86::EH_SjLj_SetJmp32: case X86::EH_SjLj_SetJmp64: return emitEHSjLjSetJmp(MI, BB); case X86::EH_SjLj_LongJmp32: case X86::EH_SjLj_LongJmp64: return emitEHSjLjLongJmp(MI, BB); case TargetOpcode::STATEPOINT: // As an implementation detail, STATEPOINT shares the STACKMAP format at // this point in the process. We diverge later. return emitPatchPoint(MI, BB); case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); case X86::VFMADDPDr213r: case X86::VFMADDPSr213r: case X86::VFMADDSDr213r: case X86::VFMADDSSr213r: case X86::VFMSUBPDr213r: case X86::VFMSUBPSr213r: case X86::VFMSUBSDr213r: case X86::VFMSUBSSr213r: case X86::VFNMADDPDr213r: case X86::VFNMADDPSr213r: case X86::VFNMADDSDr213r: case X86::VFNMADDSSr213r: case X86::VFNMSUBPDr213r: case X86::VFNMSUBPSr213r: case X86::VFNMSUBSDr213r: case X86::VFNMSUBSSr213r: case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPSr213r: case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPSr213r: case X86::VFMADDPDr213rY: case X86::VFMADDPSr213rY: case X86::VFMSUBPDr213rY: case X86::VFMSUBPSr213rY: case X86::VFNMADDPDr213rY: case X86::VFNMADDPSr213rY: case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPSr213rY: case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPSr213rY: case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPSr213rY: return emitFMA3Instr(MI, BB); } } //===----------------------------------------------------------------------===// // X86 Optimization Hooks //===----------------------------------------------------------------------===// void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { unsigned BitWidth = KnownZero.getBitWidth(); unsigned Opc = Op.getOpcode(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. switch (Opc) { default: break; case X86ISD::ADD: case X86ISD::SUB: case X86ISD::ADC: case X86ISD::SBB: case X86ISD::SMUL: case X86ISD::UMUL: case X86ISD::INC: case X86ISD::DEC: case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: // These nodes' second result is a boolean. if (Op.getResNo() == 0) break; // Fallthrough case X86ISD::SETCC: KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ISD::INTRINSIC_WO_CHAIN: { unsigned IntId = cast(Op.getOperand(0))->getZExtValue(); unsigned NumLoBits = 0; switch (IntId) { default: break; case Intrinsic::x86_sse_movmsk_ps: case Intrinsic::x86_avx_movmsk_ps_256: case Intrinsic::x86_sse2_movmsk_pd: case Intrinsic::x86_avx_movmsk_pd_256: case Intrinsic::x86_mmx_pmovmskb: case Intrinsic::x86_sse2_pmovmskb_128: case Intrinsic::x86_avx2_pmovmskb: { // High bits of movmskp{s|d}, pmovmskb are known zero. switch (IntId) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; } KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); break; } } break; } } } unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( SDValue Op, const SelectionDAG &, unsigned Depth) const { // SETCC_CARRY sets the dest to ~0 for true or 0 for false. if (Op.getOpcode() == X86ISD::SETCC_CARRY) return Op.getValueType().getScalarSizeInBits(); // Fallback case. return 1; } /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the /// node is a GlobalAddress + offset. bool X86TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const { if (N->getOpcode() == X86ISD::Wrapper) { if (isa(N->getOperand(0))) { GA = cast(N->getOperand(0))->getGlobal(); Offset = cast(N->getOperand(0))->getOffset(); return true; } } return TargetLowering::isGAPlusOffset(N, GA, Offset); } /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. /// FIXME: This could be expanded to support 512 bit vectors as well. static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget* Subtarget) { SDLoc dl(N); ShuffleVectorSDNode *SVOp = cast(N); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); MVT VT = SVOp->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); if (V1.getOpcode() == ISD::CONCAT_VECTORS && V2.getOpcode() == ISD::CONCAT_VECTORS) { // // 0,0,0,... // | // V UNDEF BUILD_VECTOR UNDEF // \ / \ / // CONCAT_VECTOR CONCAT_VECTOR // \ / // \ / // RESULT: V + zero extended // if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || V2.getOperand(1).getOpcode() != ISD::UNDEF || V1.getOperand(1).getOpcode() != ISD::UNDEF) return SDValue(); if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) return SDValue(); // To match the shuffle mask, the first half of the mask should // be exactly the first vector, and all the rest a splat with the // first element of the second one. for (unsigned i = 0; i != NumElems/2; ++i) if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) return SDValue(); // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. if (LoadSDNode *Ld = dyn_cast(V1.getOperand(0))) { if (Ld->hasNUsesOfValue(1, 0)) { SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, Ld->getMemoryVT(), Ld->getPointerInfo(), Ld->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, false/*WriteMem*/); // Make sure the newly-created LOAD is in the same position as Ld in // terms of dependency. We create a TokenFactor for Ld and ResNode, // and update uses of Ld's output chain to use the TokenFactor. if (Ld->hasAnyUseOfValue(1)) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); } return DAG.getBitcast(VT, ResNode); } } // Emit a zeroed vector and insert the desired subvector on its // first half. SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); return DCI.CombineTo(N, InsV); } return SDValue(); } /// \brief Combine an arbitrary chain of shuffles into a single instruction if /// possible. /// /// This is the leaf of the recursive combinine below. When we have found some /// chain of single-use x86 shuffle instructions and accumulated the combined /// shuffle mask represented by them, this will try to pattern match that mask /// into either a single instruction if there is a special purpose instruction /// for this operation, or into a PSHUFB instruction which is a fully general /// instruction but should only be used to replace chains over a certain depth. static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, int Depth, bool HasPSHUFB, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { assert(!Mask.empty() && "Cannot combine an empty shuffle mask!"); // Find the operand that enters the chain. Note that multiple uses are OK // here, we're not going to remove the operand we find. SDValue Input = Op.getOperand(0); while (Input.getOpcode() == ISD::BITCAST) Input = Input.getOperand(0); MVT VT = Input.getSimpleValueType(); MVT RootVT = Root.getSimpleValueType(); SDLoc DL(Root); if (Mask.size() == 1) { int Index = Mask[0]; assert((Index >= 0 || Index == SM_SentinelUndef || Index == SM_SentinelZero) && "Invalid shuffle index found!"); // We may end up with an accumulated mask of size 1 as a result of // widening of shuffle operands (see function canWidenShuffleElements). // If the only shuffle index is equal to SM_SentinelZero then propagate // a zero vector. Otherwise, the combine shuffle mask is a no-op shuffle // mask, and therefore the entire chain of shuffles can be folded away. if (Index == SM_SentinelZero) DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL)); else DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input), /*AddTo*/ true); return true; } // Use the float domain if the operand type is a floating point type. bool FloatDomain = VT.isFloatingPoint(); // For floating point shuffles, we don't have free copies in the shuffle // instructions or the ability to load as part of the instruction, so // canonicalize their shuffles to UNPCK or MOV variants. // // Note that even with AVX we prefer the PSHUFD form of shuffle for integer // vectors because it can have a load folded into it that UNPCK cannot. This // doesn't preclude something switching to the shorter encoding post-RA. // // FIXME: Should teach these routines about AVX vector widths. if (FloatDomain && VT.is128BitVector()) { if (Mask.equals({0, 0}) || Mask.equals({1, 1})) { bool Lo = Mask.equals({0, 0}); unsigned Shuffle; MVT ShuffleVT; // Check if we have SSE3 which will let us use MOVDDUP. That instruction // is no slower than UNPCKLPD but has the option to fold the input operand // into even an unaligned memory load. if (Lo && Subtarget->hasSSE3()) { Shuffle = X86ISD::MOVDDUP; ShuffleVT = MVT::v2f64; } else { // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller // than the UNPCK variants. Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS; ShuffleVT = MVT::v4f32; } if (Depth == 1 && Root->getOpcode() == Shuffle) return false; // Nothing to do! Op = DAG.getBitcast(ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); if (Shuffle == X86ISD::MOVDDUP) Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); else Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), /*AddTo*/ true); return true; } if (Subtarget->hasSSE3() && (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) { bool Lo = Mask.equals({0, 0, 2, 2}); unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP; MVT ShuffleVT = MVT::v4f32; if (Depth == 1 && Root->getOpcode() == Shuffle) return false; // Nothing to do! Op = DAG.getBitcast(ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), /*AddTo*/ true); return true; } if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) { bool Lo = Mask.equals({0, 0, 1, 1}); unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; MVT ShuffleVT = MVT::v4f32; if (Depth == 1 && Root->getOpcode() == Shuffle) return false; // Nothing to do! Op = DAG.getBitcast(ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), /*AddTo*/ true); return true; } } // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK // variants as none of these have single-instruction variants that are // superior to the UNPCK formulation. if (!FloatDomain && VT.is128BitVector() && (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) || Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) || Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) || Mask.equals( {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) { bool Lo = Mask[0] == 0; unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; if (Depth == 1 && Root->getOpcode() == Shuffle) return false; // Nothing to do! MVT ShuffleVT; switch (Mask.size()) { case 8: ShuffleVT = MVT::v8i16; break; case 16: ShuffleVT = MVT::v16i8; break; default: llvm_unreachable("Impossible mask size!"); }; Op = DAG.getBitcast(ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), /*AddTo*/ true); return true; } // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. if (Depth < 2) return false; // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we // can replace them with a single PSHUFB instruction profitably. Intel's // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but // in practice PSHUFB tends to be *very* fast so we're more aggressive. if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) { SmallVector PSHUFBMask; int NumBytes = VT.getSizeInBits() / 8; int Ratio = NumBytes / Mask.size(); for (int i = 0; i < NumBytes; ++i) { if (Mask[i / Ratio] == SM_SentinelUndef) { PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } int M = Mask[i / Ratio] != SM_SentinelZero ? Ratio * Mask[i / Ratio] + i % Ratio : 255; PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); Op = DAG.getBitcast(ByteVT, Input); DCI.AddToWorklist(Op.getNode()); SDValue PSHUFBMaskOp = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask); DCI.AddToWorklist(PSHUFBMaskOp.getNode()); Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), /*AddTo*/ true); return true; } // Failed to find any combines. return false; } /// \brief Fully generic combining of x86 shuffle instructions. /// /// This should be the last combine run over the x86 shuffle instructions. Once /// they have been fully optimized, this will recursively consider all chains /// of single-use shuffle instructions, build a generic model of the cumulative /// shuffle operation, and check for simpler instructions which implement this /// operation. We use this primarily for two purposes: /// /// 1) Collapse generic shuffles to specialized single instructions when /// equivalent. In most cases, this is just an encoding size win, but /// sometimes we will collapse multiple generic shuffles into a single /// special-purpose shuffle. /// 2) Look for sequences of shuffle instructions with 3 or more total /// instructions, and replace them with the slightly more expensive SSSE3 /// PSHUFB instruction if available. We do this as the last combining step /// to ensure we avoid using PSHUFB if we can implement the shuffle with /// a suitable short sequence of other instructions. The PHUFB will either /// use a register or have to read from memory and so is slightly (but only /// slightly) more expensive than the other shuffle instructions. /// /// Because this is inherently a quadratic operation (for each shuffle in /// a chain, we recurse up the chain), the depth is limited to 8 instructions. /// This should never be an issue in practice as the shuffle lowering doesn't /// produce sequences of more than 8 instructions. /// /// FIXME: We will currently miss some cases where the redundant shuffling /// would simplify under the threshold for PSHUFB formation because of /// combine-ordering. To fix this, we should do the redundant instruction /// combining in this recursive walk. static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, ArrayRef RootMask, int Depth, bool HasPSHUFB, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. if (Depth > 8) return false; // Directly rip through bitcasts to find the underlying operand. while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse()) Op = Op.getOperand(0); MVT VT = Op.getSimpleValueType(); if (!VT.isVector()) return false; // Bail if we hit a non-vector. assert(Root.getSimpleValueType().isVector() && "Shuffles operate on vector types!"); assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && "Can only combine shuffles of the same vector register size."); if (!isTargetShuffle(Op.getOpcode())) return false; SmallVector OpMask; bool IsUnary; bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary); // We only can combine unary shuffles which we can decode the mask for. if (!HaveMask || !IsUnary) return false; assert(VT.getVectorNumElements() == OpMask.size() && "Different mask size from vector size!"); assert(((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && "The smaller number of elements must divide the larger."); int RootRatio = std::max(1, OpMask.size() / RootMask.size()); int OpRatio = std::max(1, RootMask.size() / OpMask.size()); assert(((RootRatio == 1 && OpRatio == 1) || (RootRatio == 1) != (OpRatio == 1)) && "Must not have a ratio for both incoming and op masks!"); SmallVector Mask; Mask.reserve(std::max(OpMask.size(), RootMask.size())); // Merge this shuffle operation's mask into our accumulated mask. Note that // this shuffle's mask will be the first applied to the input, followed by the // root mask to get us all the way to the root value arrangement. The reason // for this order is that we are recursing up the operation chain. for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) { int RootIdx = i / RootRatio; if (RootMask[RootIdx] < 0) { // This is a zero or undef lane, we're done. Mask.push_back(RootMask[RootIdx]); continue; } int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; int OpIdx = RootMaskedIdx / OpRatio; if (OpMask[OpIdx] < 0) { // The incoming lanes are zero or undef, it doesn't matter which ones we // are using. Mask.push_back(OpMask[OpIdx]); continue; } // Ok, we have non-zero lanes, map them through. Mask.push_back(OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio); } // See if we can recurse into the operand to combine more things. switch (Op.getOpcode()) { case X86ISD::PSHUFB: HasPSHUFB = true; case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: if (Op.getOperand(0).hasOneUse() && combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, HasPSHUFB, DAG, DCI, Subtarget)) return true; break; case X86ISD::UNPCKL: case X86ISD::UNPCKH: assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!"); // We can't check for single use, we have to check that this shuffle is the // only user. if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, HasPSHUFB, DAG, DCI, Subtarget)) return true; break; } // Minor canonicalization of the accumulated shuffle mask to make it easier // to match below. All this does is detect masks with squential pairs of // elements, and shrink them to the half-width mask. It does this in a loop // so it will reduce the size of the mask to the minimal width mask which // performs an equivalent shuffle. SmallVector WidenedMask; while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { Mask = std::move(WidenedMask); WidenedMask.clear(); } return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI, Subtarget); } /// \brief Get the PSHUF-style mask from PSHUF node. /// /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 /// PSHUF-style masks that can be reused with such instructions. static SmallVector getPSHUFShuffleMask(SDValue N) { MVT VT = N.getSimpleValueType(); SmallVector Mask; bool IsUnary; bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Mask, IsUnary); (void)HaveMask; assert(HaveMask); // If we have more than 128-bits, only the low 128-bits of shuffle mask // matter. Check that the upper masks are repeats and remove them. if (VT.getSizeInBits() > 128) { int LaneElts = 128 / VT.getScalarSizeInBits(); #ifndef NDEBUG for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i) for (int j = 0; j < LaneElts; ++j) assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) && "Mask doesn't repeat in high 128-bit lanes!"); #endif Mask.resize(LaneElts); } switch (N.getOpcode()) { case X86ISD::PSHUFD: return Mask; case X86ISD::PSHUFLW: Mask.resize(4); return Mask; case X86ISD::PSHUFHW: Mask.erase(Mask.begin(), Mask.begin() + 4); for (int &M : Mask) M -= 4; return Mask; default: llvm_unreachable("No valid shuffle instruction found!"); } } /// \brief Search for a combinable shuffle across a chain ending in pshufd. /// /// We walk up the chain and look for a combinable shuffle, skipping over /// shuffles that we could hoist this shuffle's transformation past without /// altering anything. static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { assert(N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); // Walk up a single-use chain looking for a combinable shuffle. Keep a stack // of the shuffles in the chain so that we can form a fresh chain to replace // this one. SmallVector Chain; SDValue V = N.getOperand(0); for (; V.hasOneUse(); V = V.getOperand(0)) { switch (V.getOpcode()) { default: return SDValue(); // Nothing combined! case ISD::BITCAST: // Skip bitcasts as we always know the type for the target specific // instructions. continue; case X86ISD::PSHUFD: // Found another dword shuffle. break; case X86ISD::PSHUFLW: // Check that the low words (being shuffled) are the identity in the // dword shuffle, and the high words are self-contained. if (Mask[0] != 0 || Mask[1] != 1 || !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) return SDValue(); Chain.push_back(V); continue; case X86ISD::PSHUFHW: // Check that the high words (being shuffled) are the identity in the // dword shuffle, and the low words are self-contained. if (Mask[2] != 2 || Mask[3] != 3 || !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) return SDValue(); Chain.push_back(V); continue; case X86ISD::UNPCKL: case X86ISD::UNPCKH: // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. if (V.getSimpleValueType().getVectorElementType() != MVT::i8 && V.getSimpleValueType().getVectorElementType() != MVT::i16) return SDValue(); // Search for a half-shuffle which we can combine with. unsigned CombineOp = V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; if (V.getOperand(0) != V.getOperand(1) || !V->isOnlyUserOf(V.getOperand(0).getNode())) return SDValue(); Chain.push_back(V); V = V.getOperand(0); do { switch (V.getOpcode()) { default: return SDValue(); // Nothing to combine. case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: if (V.getOpcode() == CombineOp) break; Chain.push_back(V); // Fallthrough! case ISD::BITCAST: V = V.getOperand(0); continue; } break; } while (V.hasOneUse()); break; } // Break out of the loop if we break out of the switch. break; } if (!V.hasOneUse()) // We fell out of the loop without finding a viable combining instruction. return SDValue(); // Merge this node's mask and our incoming mask. SmallVector VMask = getPSHUFShuffleMask(V); for (int &M : Mask) M = VMask[M]; V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Rebuild the chain around this new shuffle. while (!Chain.empty()) { SDValue W = Chain.pop_back_val(); if (V.getValueType() != W.getOperand(0).getValueType()) V = DAG.getBitcast(W.getOperand(0).getValueType(), V); switch (W.getOpcode()) { default: llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); case X86ISD::UNPCKL: case X86ISD::UNPCKH: V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); break; case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); break; } } if (V.getValueType() != N.getValueType()) V = DAG.getBitcast(N.getValueType(), V); // Return the new chain to replace N. return V; } /// \brief Search for a combinable shuffle across a chain ending in pshuflw or /// pshufhw. /// /// We walk up the chain, skipping shuffles of the other half and looking /// through shuffles which switch halves trying to find a shuffle of the same /// pair of dwords. static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef Mask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { assert( (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); unsigned CombineOpcode = N.getOpcode(); // Walk up a single-use chain looking for a combinable shuffle. SDValue V = N.getOperand(0); for (; V.hasOneUse(); V = V.getOperand(0)) { switch (V.getOpcode()) { default: return false; // Nothing combined! case ISD::BITCAST: // Skip bitcasts as we always know the type for the target specific // instructions. continue; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: if (V.getOpcode() == CombineOpcode) break; // Other-half shuffles are no-ops. continue; } // Break out of the loop if we break out of the switch. break; } if (!V.hasOneUse()) // We fell out of the loop without finding a viable combining instruction. return false; // Combine away the bottom node as its shuffle will be accumulated into // a preceding shuffle. DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); // Record the old value. SDValue Old = V; // Merge this node's mask and our incoming mask (adjusted to account for all // the pshufd instructions encountered). SmallVector VMask = getPSHUFShuffleMask(V); for (int &M : Mask) M = VMask[M]; V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Check that the shuffles didn't cancel each other out. If not, we need to // combine to the new one. if (Old != V) // Replace the combinable shuffle with the combined one, updating all users // so that we re-evaluate the chain here. DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); return true; } /// \brief Try to combine x86 target specific shuffles. static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc DL(N); MVT VT = N.getSimpleValueType(); SmallVector Mask; switch (N.getOpcode()) { case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: Mask = getPSHUFShuffleMask(N); assert(Mask.size() == 4); break; case X86ISD::UNPCKL: { // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE // moves upper half elements into the lower half part. For example: // // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1, // undef:v16i8 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2 // // will be combined to: // // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not // happen due to advanced instructions. if (!VT.is128BitVector()) return SDValue(); auto Op0 = N.getOperand(0); auto Op1 = N.getOperand(1); if (Op0.getOpcode() == ISD::UNDEF && Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) { ArrayRef Mask = cast(Op1.getNode())->getMask(); unsigned NumElts = VT.getVectorNumElements(); SmallVector ExpectedMask(NumElts, -1); std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2, NumElts / 2); auto ShufOp = Op1.getOperand(0); if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp); } return SDValue(); } case X86ISD::BLENDI: { SDValue V0 = N->getOperand(0); SDValue V1 = N->getOperand(1); assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() && "Unexpected input vector types"); // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector // operands and changing the mask to 1. This saves us a bunch of // pattern-matching possibilities related to scalar math ops in SSE/AVX. // x86InstrInfo knows how to commute this back after instruction selection // if it would help register allocation. // TODO: If optimizing for size or a processor that doesn't suffer from // partial register update stalls, this should be transformed into a MOVSD // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. if (VT == MVT::v2f64) if (auto *Mask = dyn_cast(N->getOperand(2))) if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) { SDValue NewMask = DAG.getConstant(1, DL, MVT::i8); return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); } return SDValue(); } default: return SDValue(); } // Nuke no-op shuffles that show up after combining. if (isNoopShuffleMask(Mask)) return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); // Look for simplifications involving one or two shuffle instructions. SDValue V = N.getOperand(0); switch (N.getOpcode()) { default: break; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!"); if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) return SDValue(); // We combined away this shuffle, so we're done. // See if this reduces to a PSHUFD which is no more expensive and can // combine with more operations. Note that it has to at least flip the // dwords as otherwise it would have been removed as a no-op. if (makeArrayRef(Mask).equals({2, 3, 0, 1})) { int DMask[] = {0, 1, 2, 3}; int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; DMask[DOffset + 0] = DOffset + 1; DMask[DOffset + 1] = DOffset + 0; MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); V = DAG.getBitcast(DVT, V); DCI.AddToWorklist(V.getNode()); V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, getV4X86ShuffleImm8ForMask(DMask, DL, DAG)); DCI.AddToWorklist(V.getNode()); return DAG.getBitcast(VT, V); } // Look for shuffle patterns which can be implemented as a single unpack. // FIXME: This doesn't handle the location of the PSHUFD generically, and // only works when we have a PSHUFD followed by two half-shuffles. if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && (V.getOpcode() == X86ISD::PSHUFLW || V.getOpcode() == X86ISD::PSHUFHW) && V.getOpcode() != N.getOpcode() && V.hasOneUse()) { SDValue D = V.getOperand(0); while (D.getOpcode() == ISD::BITCAST && D.hasOneUse()) D = D.getOperand(0); if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { SmallVector VMask = getPSHUFShuffleMask(V); SmallVector DMask = getPSHUFShuffleMask(D); int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; int WordMask[8]; for (int i = 0; i < 4; ++i) { WordMask[i + NOffset] = Mask[i] + NOffset; WordMask[i + VOffset] = VMask[i] + VOffset; } // Map the word mask through the DWord mask. int MappedMask[8]; for (int i = 0; i < 8; ++i) MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { // We can replace all three shuffles with an unpack. V = DAG.getBitcast(VT, D.getOperand(0)); DCI.AddToWorklist(V.getNode()); return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, VT, V, V); } } } break; case X86ISD::PSHUFD: if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI)) return NewN; break; } return SDValue(); } /// \brief Try to combine a shuffle into a target-specific add-sub node. /// /// We combine this directly on the abstract vector shuffle nodes so it is /// easier to generically match. We also insert dummy vector shuffle nodes for /// the operands which explicitly discard the lanes which are unused by this /// operation to try to flow through the rest of the combiner the fact that /// they're unused. static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) return SDValue(); // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask // extraction tool to support more. if (N->getOpcode() != ISD::VECTOR_SHUFFLE) return SDValue(); auto *SVN = cast(N); SmallVector Mask; for (int M : SVN->getMask()) Mask.push_back(M); SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); // We require the first shuffle operand to be the FSUB node, and the second to // be the FADD node. if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) { ShuffleVectorSDNode::commuteMask(Mask); std::swap(V1, V2); } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD) return SDValue(); // If there are other uses of these operations we can't fold them. if (!V1->hasOneUse() || !V2->hasOneUse()) return SDValue(); // Ensure that both operations have the same operands. Note that we can // commute the FADD operands. SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) return SDValue(); // We're looking for blends between FADD and FSUB nodes. We insist on these // nodes being lined up in a specific expected pattern. if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) || isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) || isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}))) return SDValue(); return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); } /// PerformShuffleCombine - Performs several different shuffle combines. static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc dl(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); // Don't create instructions with illegal types after legalize types has run. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) return SDValue(); // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB node. if (TLI.isTypeLegal(VT)) if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG)) return AddSub; // Combine 256-bit vector shuffles. This is only profitable when in AVX mode if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI, Subtarget); // During Type Legalization, when promoting illegal vector types, // the backend might introduce new shuffle dag nodes and bitcasts. // // This code performs the following transformation: // fold: (shuffle (bitcast (BINOP A, B)), Undef, ) -> // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, ) // // We do this only if both the bitcast and the BINOP dag nodes have // one use. Also, perform this transformation only if the new binary // operation is legal. This is to avoid introducing dag nodes that // potentially need to be further expanded (or custom lowered) into a // less optimal sequence of dag nodes. if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() && N0.getOpcode() == ISD::BITCAST) { SDValue BC0 = N0.getOperand(0); EVT SVT = BC0.getValueType(); unsigned Opcode = BC0.getOpcode(); unsigned NumElts = VT.getVectorNumElements(); if (BC0.hasOneUse() && SVT.isVector() && SVT.getVectorNumElements() * 2 == NumElts && TLI.isOperationLegal(Opcode, VT)) { bool CanFold = false; switch (Opcode) { default : break; case ISD::ADD : case ISD::FADD : case ISD::SUB : case ISD::FSUB : case ISD::MUL : case ISD::FMUL : CanFold = true; } unsigned SVTNumElts = SVT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast(N); for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) CanFold = SVOp->getMaskElt(i) == (int)(i * 2); for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) CanFold = SVOp->getMaskElt(i) < 0; if (CanFold) { SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]); } } } // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are // consecutive, non-overlapping, and in the right order. SmallVector Elts; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true)) return LD; if (isTargetShuffle(N->getOpcode())) { SDValue Shuffle = PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); if (Shuffle.getNode()) return Shuffle; // Try recursively combining arbitrary sequences of x86 shuffle // instructions into higher-order shuffles. We do this after combining // specific PSHUF instruction sequences into their minimal form so that we // can evaluate how many specialized shuffle instructions are involved in // a particular chain. SmallVector NonceMask; // Just a placeholder. NonceMask.push_back(0); if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask, /*Depth*/ 1, /*HasPSHUFB*/ false, DAG, DCI, Subtarget)) return SDValue(); // This routine will use CombineTo to replace N. } return SDValue(); } /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target /// specific shuffle of a load can be folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but /// shuffles have been custom lowered so we need to handle those here. static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue InVec = N->getOperand(0); SDValue EltNo = N->getOperand(1); EVT EltVT = N->getValueType(0); if (!isa(EltNo)) return SDValue(); EVT OriginalVT = InVec.getValueType(); if (InVec.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); EVT BCVT = InVec.getOperand(0).getValueType(); if (!BCVT.isVector() || BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) return SDValue(); InVec = InVec.getOperand(0); } EVT CurrentVT = InVec.getValueType(); if (!isTargetShuffle(InVec.getOpcode())) return SDValue(); // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); SmallVector ShuffleMask; bool UnaryShuffle; if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast(EltNo)->getZExtValue(); int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt]; if (Idx == SM_SentinelZero) return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) : DAG.getConstantFP(+0.0, SDLoc(N), EltVT); if (Idx == SM_SentinelUndef) return DAG.getUNDEF(EltVT); assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1); // If inputs to shuffle are the same for both ops, then allow 2 uses unsigned AllowedUses = InVec.getNumOperands() > 1 && InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; if (LdNode.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) return SDValue(); AllowedUses = 1; // only allow 1 load use if we have a bitcast LdNode = LdNode.getOperand(0); } if (!ISD::isNormalLoad(LdNode.getNode())) return SDValue(); LoadSDNode *LN0 = cast(LdNode); if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) return SDValue(); // If there's a bitcast before the shuffle, check if the load type and // alignment is valid. unsigned Align = LN0->getAlignment(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( EltVT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) return SDValue(); // All checks match so transform back to vector_shuffle so that DAG combiner // can finish the job SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : InVec.getOperand(1); Shuffle = DAG.getVectorShuffle(CurrentVT, dl, InVec.getOperand(0), Shuffle, &ShuffleMask[0]); Shuffle = DAG.getBitcast(OriginalVT, Shuffle); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // Detect bitcasts between i32 to x86mmx low word. Since MMX types are // special and don't usually play with other vector types, it's better to // handle them early to be sure we emit efficient code by avoiding // store-load conversions. if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR && N0.getValueType() == MVT::v2i32 && isNullConstant(N0.getOperand(1))) { SDValue N00 = N0->getOperand(0); if (N00.getValueType() == MVT::i32) return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00); } // Convert a bitcasted integer logic operation that has one bitcasted // floating-point operand and one constant operand into a floating-point // logic operation. This may create a load of the constant, but that is // cheaper than materializing the constant in an integer register and // transferring it to an SSE register or transferring the SSE operand to // integer register and back. unsigned FPOpcode; switch (N0.getOpcode()) { case ISD::AND: FPOpcode = X86ISD::FAND; break; case ISD::OR: FPOpcode = X86ISD::FOR; break; case ISD::XOR: FPOpcode = X86ISD::FXOR; break; default: return SDValue(); } if (((Subtarget->hasSSE1() && VT == MVT::f32) || (Subtarget->hasSSE2() && VT == MVT::f64)) && isa(N0.getOperand(1)) && N0.getOperand(0).getOpcode() == ISD::BITCAST && N0.getOperand(0).getOperand(0).getValueType() == VT) { SDValue N000 = N0.getOperand(0).getOperand(0); SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1)); return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst); } return SDValue(); } /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index /// generation and convert it from being a bunch of shuffles and extracts /// into a somewhat faster sequence. For i686, the best sequence is apparently /// storing the value and loading scalars back, while for x64 we should /// use 64-bit extracts and shifts. static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; SDValue InputVector = N->getOperand(0); SDLoc dl(InputVector); // Detect mmx to i32 conversion through a v2i32 elt extract. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && N->getValueType(0) == MVT::i32 && InputVector.getValueType() == MVT::v2i32) { // The bitcast source is a direct mmx result. SDValue MMXSrc = InputVector.getNode()->getOperand(0); if (MMXSrc.getValueType() == MVT::x86mmx) return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), N->getValueType(0), InputVector.getNode()->getOperand(0)); // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))). if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() && MMXSrc.getValueType() == MVT::i64) { SDValue MMXSrcOp = MMXSrc.getOperand(0); if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST && MMXSrcOp.getValueType() == MVT::v1i64 && MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx) return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), N->getValueType(0), MMXSrcOp.getOperand(0)); } } EVT VT = N->getValueType(0); if (VT == MVT::i1 && isa(N->getOperand(1)) && InputVector.getOpcode() == ISD::BITCAST && isa(InputVector.getOperand(0))) { uint64_t ExtractedElt = cast(N->getOperand(1))->getZExtValue(); uint64_t InputValue = cast(InputVector.getOperand(0))->getZExtValue(); uint64_t Res = (InputValue >> ExtractedElt) & 1; return DAG.getConstant(Res, dl, MVT::i1); } // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. if (InputVector.getValueType() != MVT::v4i32) return SDValue(); // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a // single use which is a sign-extend or zero-extend, and all elements are // used. SmallVector Uses; unsigned ExtractedElements = 0; for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { if (UI.getUse().getResNo() != InputVector.getResNo()) return SDValue(); SDNode *Extract = *UI; if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); if (Extract->getValueType(0) != MVT::i32) return SDValue(); if (!Extract->hasOneUse()) return SDValue(); if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) return SDValue(); if (!isa(Extract->getOperand(1))) return SDValue(); // Record which element was extracted. ExtractedElements |= 1 << cast(Extract->getOperand(1))->getZExtValue(); Uses.push_back(Extract); } // If not all the elements were used, this may not be worthwhile. if (ExtractedElements != 15) return SDValue(); // Ok, we've now decided to do the transformation. // If 64-bit shifts are legal, use the extract-shift sequence, // otherwise bounce the vector off the cache. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Vals[4]; if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector); auto &DL = DAG.getDataLayout(); EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL); SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, DAG.getConstant(0, dl, VecIdxTy)); SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, DAG.getConstant(1, dl, VecIdxTy)); SDValue ShAmt = DAG.getConstant( 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL)); Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf); Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); } else { // Store the value to a temporary stack slot. SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, MachinePointerInfo(), false, false, 0); EVT ElementType = InputVector.getValueType().getVectorElementType(); unsigned EltSize = ElementType.getSizeInBits() / 8; // Replace each use (extract) with a load of the appropriate element. for (unsigned i = 0; i < 4; ++i) { uint64_t Offset = EltSize * i; auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT); SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal); // Load the scalar. Vals[i] = DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo(), false, false, false, 0); } } // Replace the extracts for (SmallVectorImpl::iterator UI = Uses.begin(), UE = Uses.end(); UI != UE; ++UI) { SDNode *Extract = *UI; SDValue Idx = Extract->getOperand(1); uint64_t IdxVal = cast(Idx)->getZExtValue(); DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); } // The replacement was made in place; don't return anything. return SDValue(); } static SDValue transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDLoc dl(N); SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); if (Cond.getOpcode() == ISD::SIGN_EXTEND) { SDValue CondSrc = Cond->getOperand(0); if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG) Cond = CondSrc->getOperand(0); } if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); // A vselect where all conditions and data are constants can be optimized into // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) return SDValue(); unsigned MaskValue = 0; if (!BUILD_VECTORtoBlendMask(cast(Cond), MaskValue)) return SDValue(); MVT VT = N->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); SmallVector ShuffleMask(NumElems, -1); for (unsigned i = 0; i < NumElems; ++i) { // Be sure we emit undef where we can. if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF) ShuffleMask[i] = -1; else ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1); } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isShuffleMaskLegal(ShuffleMask, VT)) return SDValue(); return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]); } /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT /// nodes. static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc DL(N); SDValue Cond = N->getOperand(0); // Get the LHS/RHS of the select. SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); EVT VT = LHS.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we have SSE[12] support, try to form min/max nodes. SSE min/max // instructions match the semantics of the common C idiom xhasSSE2() || (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); unsigned Opcode = 0; // Check for x CC y ? x : y. if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { switch (CC) { default: break; case ISD::SETULT: // Converting this to a min would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMIN; break; case ISD::SETULE: // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: Opcode = X86ISD::FMIN; break; case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMAX; break; case ISD::SETUGT: // Converting this to a max would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETUGE: // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: Opcode = X86ISD::FMAX; break; } // Check for x CC y ? y : x -- a min/max with reversed arms. } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && DAG.isEqualTo(RHS, Cond.getOperand(0))) { switch (CC) { default: break; case ISD::SETOGE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) break; Opcode = X86ISD::FMIN; break; case ISD::SETUGE: // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: Opcode = X86ISD::FMIN; break; case ISD::SETULT: // Converting this to a max would handle NaNs incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; Opcode = X86ISD::FMAX; break; case ISD::SETOLE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETULE: // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: Opcode = X86ISD::FMAX; break; } } if (Opcode) return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); } EVT CondVT = Cond.getValueType(); if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1) { // v16i8 (select v16i1, v16i8, v16i8) does not have a proper // lowering on KNL. In this case we convert it to // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. // The same situation for all 128 and 256-bit vectors of i8 and i16. // Since SKX these selects have a proper lowering. EVT OpVT = LHS.getValueType(); if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && (OpVT.getVectorElementType() == MVT::i8 || OpVT.getVectorElementType() == MVT::i16) && !(Subtarget->hasBWI() && Subtarget->hasVLX())) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); DCI.AddToWorklist(Cond.getNode()); return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); } } // If this is a select between two integer constants, try to do some // optimizations. if (ConstantSDNode *TrueC = dyn_cast(LHS)) { if (ConstantSDNode *FalseC = dyn_cast(RHS)) // Don't do this for crazy integer types. if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { // If this is efficiently invertible, canonicalize the LHSC/RHSC values // so that TrueC (the true value) is larger than FalseC. bool NeedsCondInvert = false; if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && // Efficiently invertible. (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. isa(Cond.getOperand(1))))) { NeedsCondInvert = true; std::swap(TrueC, FalseC); } // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { if (NeedsCondInvert) // Invert the condition if needed. Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, DAG.getConstant(1, DL, Cond.getValueType())); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); unsigned ShAmt = TrueC->getAPIntValue().logBase2(); return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, DAG.getConstant(ShAmt, DL, MVT::i8)); } // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { if (NeedsCondInvert) // Invert the condition if needed. Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, DAG.getConstant(1, DL, Cond.getValueType())); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); } // Optimize cases that will turn into an LEA instruction. This requires // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; bool isFastMultiplier = false; if (Diff < 10) { switch ((unsigned char)Diff) { default: break; case 1: // result = add base, cond case 2: // result = lea base( , cond*2) case 3: // result = lea base(cond, cond*2) case 4: // result = lea base( , cond*4) case 5: // result = lea base(cond, cond*4) case 8: // result = lea base( , cond*8) case 9: // result = lea base(cond, cond*8) isFastMultiplier = true; break; } } if (isFastMultiplier) { APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); if (NeedsCondInvert) // Invert the condition if needed. Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, DAG.getConstant(1, DL, Cond.getValueType())); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); // Scale the condition by the difference. if (Diff != 1) Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, DAG.getConstant(Diff, DL, Cond.getValueType())); // Add the base if non-zero. if (FalseC->getAPIntValue() != 0) Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); return Cond; } } } } // Canonicalize max and min: // (x > y) ? x : y -> (x >= y) ? x : y // (x < y) ? x : y -> (x <= y) ? x : y // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates // the need for an extra compare // against zero. e.g. // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0 // subl %esi, %edi // testl %edi, %edi // movl $0, %eax // cmovgl %edi, %eax // => // xorl %eax, %eax // subl %esi, $edi // cmovsl %eax, %edi if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); switch (CC) { default: break; case ISD::SETLT: case ISD::SETGT: { ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0), Cond.getOperand(1), NewCC); return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); } } } // Early exit check if (!TLI.isTypeLegal(VT)) return SDValue(); // Match VSELECTs into subs with unsigned saturation. if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); // Check if one of the arms of the VSELECT is a zero vector. If it's on the // left side invert the predicate to simplify logic below. SDValue Other; if (ISD::isBuildVectorAllZeros(LHS.getNode())) { Other = RHS; CC = ISD::getSetCCInverse(CC, true); } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { Other = LHS; } if (Other.getNode() && Other->getNumOperands() == 2 && DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); SDValue CondRHS = Cond->getOperand(1); // Look for a general sub with unsigned saturation first. // x >= y ? x-y : 0 --> subus x, y // x > y ? x-y : 0 --> subus x, y if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); if (auto *OpRHSBV = dyn_cast(OpRHS)) if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { if (auto *CondRHSBV = dyn_cast(CondRHS)) if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode()) // If the RHS is a constant we have to reverse the const // canonicalization. // x > C-1 ? x+-C : 0 --> subus x, C if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && CondRHSConst->getAPIntValue() == (-OpRHSConst->getAPIntValue() - 1)) return DAG.getNode( X86ISD::SUBUS, DL, VT, OpLHS, DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT)); // Another special case: If C was a sign bit, the sub has been // canonicalized into a xor. // FIXME: Would it be better to use computeKnownBits to determine // whether it's safe to decanonicalize the xor? // x s< 0 ? x^C : 0 --> subus x, C if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && ISD::isBuildVectorAllZeros(CondRHS.getNode()) && OpRHSConst->getAPIntValue().isSignBit()) // Note that we have to rebuild the RHS constant here to ensure we // don't rely on particular values of undef lanes. return DAG.getNode( X86ISD::SUBUS, DL, VT, OpLHS, DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT)); } } } // Simplify vector selection if condition value type matches vselect // operand type if (N->getOpcode() == ISD::VSELECT && CondVT == VT) { assert(Cond.getValueType().isVector() && "vector select expects a vector selector!"); bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); // Try invert the condition if true value is not all 1s and false value // is not all 0s. if (!TValIsAllOnes && !FValIsAllZeros && // Check if the selector will be produced by CMPP*/PCMP* Cond.getOpcode() == ISD::SETCC && // Check if SETCC has already been promoted TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == CondVT) { bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); if (TValIsAllZeros || FValIsAllOnes) { SDValue CC = Cond.getOperand(2); ISD::CondCode NewCC = ISD::getSetCCInverse(cast(CC)->get(), Cond.getOperand(0).getValueType().isInteger()); Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); std::swap(LHS, RHS); TValIsAllOnes = FValIsAllOnes; FValIsAllZeros = TValIsAllZeros; } } if (TValIsAllOnes || FValIsAllZeros) { SDValue Ret; if (TValIsAllOnes && FValIsAllZeros) Ret = Cond; else if (TValIsAllOnes) Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS)); else if (FValIsAllZeros) Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, DAG.getBitcast(CondVT, LHS)); return DAG.getBitcast(VT, Ret); } } // We should generate an X86ISD::BLENDI from a vselect if its argument // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of // constants. This specific pattern gets generated when we split a // selector for a 512 bit vector in a machine without AVX512 (but with // 256-bit vectors), during legalization: // // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) // // Iff we find this pattern and the build_vectors are built from // constants, we translate the vselect into a shuffle_vector that we // know will be matched by LowerVECTOR_SHUFFLEtoBlend. if ((N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SHRUNKBLEND) && !DCI.isBeforeLegalize() && !VT.is512BitVector()) { SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); if (Shuffle.getNode()) return Shuffle; } // If this is a *dynamic* select (non-constant condition) and we can match // this node with one of the variable blend instructions, restructure the // condition so that the blends can use the high bit of each element and use // SimplifyDemandedBits to simplify the condition operand. if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && !DCI.isBeforeLegalize() && !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { unsigned BitWidth = Cond.getValueType().getScalarSizeInBits(); // Don't optimize vector selects that map to mask-registers. if (BitWidth == 1) return SDValue(); // We can only handle the cases where VSELECT is directly legal on the // subtarget. We custom lower VSELECT nodes with constant conditions and // this makes it hard to see whether a dynamic VSELECT will correctly // lower, so we both check the operation's status and explicitly handle the // cases where a *dynamic* blend will fail even though a constant-condition // blend could be custom lowered. // FIXME: We should find a better way to handle this class of problems. // Potentially, we should combine constant-condition vselect nodes // pre-legalization into shuffles and not mark as many types as custom // lowered. if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) return SDValue(); // FIXME: We don't support i16-element blends currently. We could and // should support them by making *all* the bits in the condition be set // rather than just the high bit and using an i8-element blend. if (VT.getVectorElementType() == MVT::i16) return SDValue(); // Dynamic blending was only available from SSE4.1 onward. if (VT.is128BitVector() && !Subtarget->hasSSE41()) return SDValue(); // Byte blends are only available in AVX2 if (VT == MVT::v32i8 && !Subtarget->hasAVX2()) return SDValue(); assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), DCI.isBeforeLegalizeOps()); if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) { // If we changed the computation somewhere in the DAG, this change // will affect all users of Cond. // Make sure it is fine and update all the nodes so that we do not // use the generic VSELECT anymore. Otherwise, we may perform // wrong optimizations as we messed up with the actual expectation // for the vector boolean values. if (Cond != TLO.Old) { // Check all uses of that condition operand to check whether it will be // consumed by non-BLEND instructions, which may depend on all bits are // set properly. for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); I != E; ++I) if (I->getOpcode() != ISD::VSELECT) // TODO: Add other opcodes eventually lowered into BLEND. return SDValue(); // Update all the users of the condition, before committing the change, // so that the VSELECT optimizations that expect the correct vector // boolean value will not be triggered. for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); I != E; ++I) DAG.ReplaceAllUsesOfValueWith( SDValue(*I, 0), DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0), Cond, I->getOperand(1), I->getOperand(2))); DCI.CommitTargetLoweringOpt(TLO); return SDValue(); } // At this point, only Cond is changed. Change the condition // just for N to keep the opportunity to optimize all other // users their own way. DAG.ReplaceAllUsesOfValueWith( SDValue(N, 0), DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0), TLO.New, N->getOperand(1), N->getOperand(2))); return SDValue(); } } return SDValue(); } // Check whether a boolean test is testing a boolean value generated by // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition // code. // // Simplify the following patterns: // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) // to (Op EFLAGS Cond) // // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) // to (Op EFLAGS !Cond) // // where Op could be BRCOND or CMOV. // static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { // Quit if not CMP and SUB with its value result used. if (Cmp.getOpcode() != X86ISD::CMP && (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0))) return SDValue(); // Quit if not used as a boolean value. if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); // Check CMP operands. One of them should be 0 or 1 and the other should be // an SetCC or extended from it. SDValue Op1 = Cmp.getOperand(0); SDValue Op2 = Cmp.getOperand(1); SDValue SetCC; const ConstantSDNode* C = nullptr; bool needOppositeCond = (CC == X86::COND_E); bool checkAgainstTrue = false; // Is it a comparison against 1? if ((C = dyn_cast(Op1))) SetCC = Op2; else if ((C = dyn_cast(Op2))) SetCC = Op1; else // Quit if all operands are not constants. return SDValue(); if (C->getZExtValue() == 1) { needOppositeCond = !needOppositeCond; checkAgainstTrue = true; } else if (C->getZExtValue() != 0) // Quit if the constant is neither 0 or 1. return SDValue(); bool truncatedToBoolWithAnd = false; // Skip (zext $x), (trunc $x), or (and $x, 1) node. while (SetCC.getOpcode() == ISD::ZERO_EXTEND || SetCC.getOpcode() == ISD::TRUNCATE || SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; if (isOneConstant(SetCC.getOperand(0))) OpIdx = 1; if (isOneConstant(SetCC.getOperand(1))) OpIdx = 0; if (OpIdx == -1) break; SetCC = SetCC.getOperand(OpIdx); truncatedToBoolWithAnd = true; } else SetCC = SetCC.getOperand(0); } switch (SetCC.getOpcode()) { case X86ISD::SETCC_CARRY: // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, // i.e. it's a comparison against true but the result of SETCC_CARRY is not // truncated to i1 using 'and'. if (checkAgainstTrue && !truncatedToBoolWithAnd) break; assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && "Invalid use of SETCC_CARRY!"); // FALL THROUGH case X86ISD::SETCC: // Set the condition code or opposite one if necessary. CC = X86::CondCode(SetCC.getConstantOperandVal(0)); if (needOppositeCond) CC = X86::GetOppositeBranchCondition(CC); return SetCC.getOperand(1); case X86ISD::CMOV: { // Check whether false/true value has canonical one, i.e. 0 or 1. ConstantSDNode *FVal = dyn_cast(SetCC.getOperand(0)); ConstantSDNode *TVal = dyn_cast(SetCC.getOperand(1)); // Quit if true value is not a constant. if (!TVal) return SDValue(); // Quit if false value is not a constant. if (!FVal) { SDValue Op = SetCC.getOperand(0); // Skip 'zext' or 'trunc' node. if (Op.getOpcode() == ISD::ZERO_EXTEND || Op.getOpcode() == ISD::TRUNCATE) Op = Op.getOperand(0); // A special case for rdrand/rdseed, where 0 is set if false cond is // found. if ((Op.getOpcode() != X86ISD::RDRAND && Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) return SDValue(); } // Quit if false value is not the constant 0 or 1. bool FValIsFalse = true; if (FVal && FVal->getZExtValue() != 0) { if (FVal->getZExtValue() != 1) return SDValue(); // If FVal is 1, opposite cond is needed. needOppositeCond = !needOppositeCond; FValIsFalse = false; } // Quit if TVal is not the constant opposite of FVal. if (FValIsFalse && TVal->getZExtValue() != 1) return SDValue(); if (!FValIsFalse && TVal->getZExtValue() != 0) return SDValue(); CC = X86::CondCode(SetCC.getConstantOperandVal(2)); if (needOppositeCond) CC = X86::GetOppositeBranchCondition(CC); return SetCC.getOperand(3); } } return SDValue(); } /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. /// Match: /// (X86or (X86setcc) (X86setcc)) /// (X86cmp (and (X86setcc) (X86setcc)), 0) static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd) { if (Cond->getOpcode() == X86ISD::CMP) { if (!isNullConstant(Cond->getOperand(1))) return false; Cond = Cond->getOperand(0); } isAnd = false; SDValue SetCC0, SetCC1; switch (Cond->getOpcode()) { default: return false; case ISD::AND: case X86ISD::AND: isAnd = true; // fallthru case ISD::OR: case X86ISD::OR: SetCC0 = Cond->getOperand(0); SetCC1 = Cond->getOperand(1); break; }; // Make sure we have SETCC nodes, using the same flags value. if (SetCC0.getOpcode() != X86ISD::SETCC || SetCC1.getOpcode() != X86ISD::SETCC || SetCC0->getOperand(1) != SetCC1->getOperand(1)) return false; CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); Flags = SetCC0->getOperand(1); return true; } /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc DL(N); // If the flag operand isn't dead, don't touch this CMOV. if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) return SDValue(); SDValue FalseOp = N->getOperand(0); SDValue TrueOp = N->getOperand(1); X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); SDValue Cond = N->getOperand(3); if (CC == X86::COND_E || CC == X86::COND_NE) { switch (Cond.getOpcode()) { default: break; case X86ISD::BSR: case X86ISD::BSF: // If operand of BSR / BSF are proven never zero, then ZF cannot be set. if (DAG.isKnownNeverZero(Cond.getOperand(0))) return (CC == X86::COND_E) ? FalseOp : TrueOp; } } SDValue Flags; Flags = checkBoolTestSetCCCombine(Cond, CC); if (Flags.getNode() && // Extra check as FCMOV only supports a subset of X86 cond. (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { SDValue Ops[] = { FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8), Flags }; return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); } // If this is a select between two integer constants, try to do some // optimizations. Note that the operands are ordered the opposite of SELECT // operands. if (ConstantSDNode *TrueC = dyn_cast(TrueOp)) { if (ConstantSDNode *FalseC = dyn_cast(FalseOp)) { // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is // larger than FalseC (the false value). if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueC, FalseC); std::swap(TrueOp, FalseOp); } // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. // This is efficient for any integer data type (including i8/i16) and // shift amount. if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, DL, MVT::i8), Cond); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); unsigned ShAmt = TrueC->getAPIntValue().logBase2(); Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, DAG.getConstant(ShAmt, DL, MVT::i8)); if (N->getNumValues() == 2) // Dead flag value? return DCI.CombineTo(N, Cond, SDValue()); return Cond; } // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient // for any integer data type, including i8/i16. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, DL, MVT::i8), Cond); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); if (N->getNumValues() == 2) // Dead flag value? return DCI.CombineTo(N, Cond, SDValue()); return Cond; } // Optimize cases that will turn into an LEA instruction. This requires // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; bool isFastMultiplier = false; if (Diff < 10) { switch ((unsigned char)Diff) { default: break; case 1: // result = add base, cond case 2: // result = lea base( , cond*2) case 3: // result = lea base(cond, cond*2) case 4: // result = lea base( , cond*4) case 5: // result = lea base(cond, cond*4) case 8: // result = lea base( , cond*8) case 9: // result = lea base(cond, cond*8) isFastMultiplier = true; break; } } if (isFastMultiplier) { APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, DL, MVT::i8), Cond); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); // Scale the condition by the difference. if (Diff != 1) Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, DAG.getConstant(Diff, DL, Cond.getValueType())); // Add the base if non-zero. if (FalseC->getAPIntValue() != 0) Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); if (N->getNumValues() == 2) // Dead flag value? return DCI.CombineTo(N, Cond, SDValue()); return Cond; } } } } // Handle these cases: // (select (x != c), e, c) -> select (x != c), e, x), // (select (x == c), c, e) -> select (x == c), x, e) // where the c is an integer constant, and the "select" is the combination // of CMOV and CMP. // // The rationale for this change is that the conditional-move from a constant // needs two instructions, however, conditional-move from a register needs // only one instruction. // // CAVEAT: By replacing a constant with a symbolic value, it may obscure // some instruction-combining opportunities. This opt needs to be // postponed as late as possible. // if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { // the DCI.xxxx conditions are provided to postpone the optimization as // late as possible. ConstantSDNode *CmpAgainst = nullptr; if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && (CmpAgainst = dyn_cast(Cond.getOperand(1))) && !isa(Cond.getOperand(0))) { if (CC == X86::COND_NE && CmpAgainst == dyn_cast(FalseOp)) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueOp, FalseOp); } if (CC == X86::COND_E && CmpAgainst == dyn_cast(TrueOp)) { SDValue Ops[] = { FalseOp, Cond.getOperand(0), DAG.getConstant(CC, DL, MVT::i8), Cond }; return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops); } } } // Fold and/or of setcc's to double CMOV: // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) // // This combine lets us generate: // cmovcc1 (jcc1 if we don't have CMOV) // cmovcc2 (same) // instead of: // setcc1 // setcc2 // and/or // cmovne (jne if we don't have CMOV) // When we can't use the CMOV instruction, it might increase branch // mispredicts. // When we can use CMOV, or when there is no mispredict, this improves // throughput and reduces register pressure. // if (CC == X86::COND_NE) { SDValue Flags; X86::CondCode CC0, CC1; bool isAndSetCC; if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { if (isAndSetCC) { std::swap(FalseOp, TrueOp); CC0 = X86::GetOppositeBranchCondition(CC0); CC1 = X86::GetOppositeBranchCondition(CC1); } SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8), Flags}; SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps); SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags}; SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1)); return CMOV; } } return SDValue(); } /// PerformMulCombine - Optimize a single multiply with constant into two /// in order to implement it with two cheaper instructions, e.g. /// LEA + SHL, LEA + LEA. static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // An imul is usually smaller than the alternative sequence. if (DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); EVT VT = N->getValueType(0); if (VT != MVT::i64 && VT != MVT::i32) return SDValue(); ConstantSDNode *C = dyn_cast(N->getOperand(1)); if (!C) return SDValue(); uint64_t MulAmt = C->getZExtValue(); if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) return SDValue(); uint64_t MulAmt1 = 0; uint64_t MulAmt2 = 0; if ((MulAmt % 9) == 0) { MulAmt1 = 9; MulAmt2 = MulAmt / 9; } else if ((MulAmt % 5) == 0) { MulAmt1 = 5; MulAmt2 = MulAmt / 5; } else if ((MulAmt % 3) == 0) { MulAmt1 = 3; MulAmt2 = MulAmt / 3; } SDLoc DL(N); SDValue NewMul; if (MulAmt2 && (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ if (isPowerOf2_64(MulAmt2) && !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) // If second multiplifer is pow2, issue it first. We want the multiply by // 3, 5, or 9 to be folded into the addressing mode unless the lone use // is an add. std::swap(MulAmt1, MulAmt2); if (isPowerOf2_64(MulAmt1)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(MulAmt1, DL, VT)); if (isPowerOf2_64(MulAmt2)) NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); } if (!NewMul) { assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && "Both cases that could cause potential overflows should have " "already been handled."); if (isPowerOf2_64(MulAmt - 1)) // (mul x, 2^N + 1) => (add (shl x, N), x) NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt - 1), DL, MVT::i8))); else if (isPowerOf2_64(MulAmt + 1)) // (mul x, 2^N - 1) => (sub (shl x, N), x) NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt + 1), DL, MVT::i8)), N->getOperand(0)); } if (NewMul) // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, NewMul, false); return SDValue(); } static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N0.getValueType(); // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) // since the result of setcc_c is all zero's or all ones. if (VT.isInteger() && !VT.isVector() && N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); APInt ShAmt = N1C->getAPIntValue(); Mask = Mask.shl(ShAmt); bool MaskOK = false; // We can handle cases concerning bit-widening nodes containing setcc_c if // we carefully interrogate the mask to make sure we are semantics // preserving. // The transform is not safe if the result of C1 << C2 exceeds the bitwidth // of the underlying setcc_c operation if the setcc_c was zero extended. // Consider the following example: // zext(setcc_c) -> i32 0x0000FFFF // c1 -> i32 0x0000FFFF // c2 -> i32 0x00000001 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE if (N00.getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = true; } else if (N00.getOpcode() == ISD::SIGN_EXTEND && N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = true; } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || N00.getOpcode() == ISD::ANY_EXTEND) && N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); } if (MaskOK && Mask != 0) { SDLoc DL(N); return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); } } // Hardware support for vector shifts is sparse which makes us scalarize the // vector operations in many cases. Also, on sandybridge ADD is faster than // shl. // (shl V, 1) -> add V,V if (auto *N1BV = dyn_cast(N1)) if (auto *N1SplatC = N1BV->getConstantSplatNode()) { assert(N0.getValueType().isVector() && "Invalid vector shift type"); // We shift all of the values by one. In many cases we do not have // hardware support for this operation. This is better expressed as an ADD // of two values. if (N1SplatC->getAPIntValue() == 1) return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } return SDValue(); } static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned Size = VT.getSizeInBits(); // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) // depending on sign of (SarConst - [56,48,32,24,16]) // sexts in X86 are MOVs. The MOVs have the same code size // as above SHIFTs (only SHIFT on 1 has lower code size). // However the MOVs have 2 advantages to a SHIFT: // 1. MOVs can write to a register that differs from source // 2. MOVs accept memory operands if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant || N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || N0.getOperand(1).getOpcode() != ISD::Constant) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); APInt ShlConst = (cast(N01))->getAPIntValue(); APInt SarConst = (cast(N1))->getAPIntValue(); EVT CVT = N1.getValueType(); if (SarConst.isNegative()) return SDValue(); for (MVT SVT : MVT::integer_valuetypes()) { unsigned ShiftSize = SVT.getSizeInBits(); // skipping types without corresponding sext/zext and // ShlConst that is not one of [56,48,32,24,16] if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize) continue; SDLoc DL(N); SDValue NN = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); SarConst = SarConst - (Size - ShiftSize); if (SarConst == 0) return NN; else if (SarConst.isNegative()) return DAG.getNode(ISD::SHL, DL, VT, NN, DAG.getConstant(-SarConst, DL, CVT)); else return DAG.getNode(ISD::SRA, DL, VT, NN, DAG.getConstant(SarConst, DL, CVT)); } return SDValue(); } /// \brief Returns a vector of 0s if the node in input is a vector logical /// shift by a constant amount which is known to be bigger than or equal /// to the vector element size in bits. static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && (!Subtarget->hasInt256() || (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) return SDValue(); SDValue Amt = N->getOperand(1); SDLoc DL(N); if (auto *AmtBV = dyn_cast(Amt)) if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { APInt ShiftAmt = AmtSplat->getAPIntValue(); unsigned MaxAmount = VT.getSimpleVT().getVectorElementType().getSizeInBits(); // SSE2/AVX2 logical shifts always return a vector of 0s // if the shift amount is bigger than or equal to // the element size. The constant shift amount will be // encoded as a 8-bit immediate. if (ShiftAmt.trunc(8).uge(MaxAmount)) return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); } return SDValue(); } /// PerformShiftCombine - Combine shifts. static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (N->getOpcode() == ISD::SHL) if (SDValue V = PerformSHLCombine(N, DAG)) return V; if (N->getOpcode() == ISD::SRA) if (SDValue V = PerformSRACombine(N, DAG)) return V; // Try to fold this logical shift into a zero vector. if (N->getOpcode() != ISD::SRA) if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) return V; return SDValue(); } // CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) // where both setccs reference the same FP CMP, and rewrite for CMPEQSS // and friends. Likewise for OR -> CMPNEQSS. static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { unsigned opcode; // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but // we're requiring SSE2 for both. if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CMP0 = N0->getOperand(1); SDValue CMP1 = N1->getOperand(1); SDLoc DL(N); // The SETCCs should both refer to the same CMP. if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) return SDValue(); SDValue CMP00 = CMP0->getOperand(0); SDValue CMP01 = CMP0->getOperand(1); EVT VT = CMP00.getValueType(); if (VT == MVT::f32 || VT == MVT::f64) { bool ExpectingFlags = false; // Check for any users that want flags: for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); !ExpectingFlags && UI != UE; ++UI) switch (UI->getOpcode()) { default: case ISD::BR_CC: case ISD::BRCOND: case ISD::SELECT: ExpectingFlags = true; break; case ISD::CopyToReg: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: break; } if (!ExpectingFlags) { enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { X86::CondCode tmp = cc0; cc0 = cc1; cc1 = tmp; } if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { // FIXME: need symbolic constants for these magic numbers. // See X86ATTInstPrinter.cpp:printSSECC(). unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; if (Subtarget->hasAVX512()) { SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00, CMP01, DAG.getConstant(x86cc, DL, MVT::i8)); if (N->getValueType(0) != MVT::i1) return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), FSetCC); return FSetCC; } SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, CMP01, DAG.getConstant(x86cc, DL, MVT::i8)); bool is64BitFP = (CMP00.getValueType() == MVT::f64); MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; if (is64BitFP && !Subtarget->is64Bit()) { // On a 32-bit target, we cannot bitcast the 64-bit float to a // 64-bit integer, since that's not a legal type. Since // OnesOrZeroesF is all ones of all zeroes, we don't need all the // bits, but can do this little dance to extract the lowest 32 bits // and work with those going forward. SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, OnesOrZeroesF); SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64); OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32, DAG.getIntPtrConstant(0, DL)); IntVT = MVT::i32; } SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF); SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, DAG.getConstant(1, DL, IntVT)); SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); return OneBitOfTruth; } } } } return SDValue(); } /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector /// so it can be folded inside ANDNP. static bool CanFoldXORWithAllOnes(const SDNode *N) { EVT VT = N->getValueType(0); // Match direct AllOnes for 128 and 256-bit vectors if (ISD::isBuildVectorAllOnes(N)) return true; // Look through a bit convert. if (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); // Sometimes the operand may come from a insert_subvector building a 256-bit // allones vector if (VT.is256BitVector() && N->getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && V1.getOperand(0).getOpcode() == ISD::UNDEF && ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && ISD::isBuildVectorAllOnes(V2.getNode())) return true; } return false; } // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized // register. In most cases we actually compare or select YMM-sized registers // and mixing the two types creates horrible code. This method optimizes // some of the transition sequences. static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); if (!VT.is256BitVector()) return SDValue(); assert((N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); SDValue Narrow = N->getOperand(0); EVT NarrowVT = Narrow->getValueType(0); if (!NarrowVT.is128BitVector()) return SDValue(); if (Narrow->getOpcode() != ISD::XOR && Narrow->getOpcode() != ISD::AND && Narrow->getOpcode() != ISD::OR) return SDValue(); SDValue N0 = Narrow->getOperand(0); SDValue N1 = Narrow->getOperand(1); SDLoc DL(Narrow); // The Left side has to be a trunc. if (N0.getOpcode() != ISD::TRUNCATE) return SDValue(); // The type of the truncated inputs. EVT WideVT = N0->getOperand(0)->getValueType(0); if (WideVT != VT) return SDValue(); // The right side has to be a 'trunc' or a constant vector. bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; ConstantSDNode *RHSConstSplat = nullptr; if (auto *RHSBV = dyn_cast(N1)) RHSConstSplat = RHSBV->getConstantSplatNode(); if (!RHSTrunc && !RHSConstSplat) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT)) return SDValue(); // Set N0 and N1 to hold the inputs to the new wide operation. N0 = N0->getOperand(0); if (RHSConstSplat) { N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(), SDValue(RHSConstSplat, 0)); SmallVector C(WideVT.getVectorNumElements(), N1); N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); } else if (RHSTrunc) { N1 = N1->getOperand(0); } // Generate the wide operation. SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); unsigned Opcode = N->getOpcode(); switch (Opcode) { case ISD::ANY_EXTEND: return Op; case ISD::ZERO_EXTEND: { unsigned InBits = NarrowVT.getScalarSizeInBits(); APInt Mask = APInt::getAllOnesValue(InBits); Mask = Mask.zext(VT.getScalarSizeInBits()); return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(Mask, DL, VT)); } case ISD::SIGN_EXTEND: return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(NarrowVT)); default: llvm_unreachable("Unexpected opcode"); } } static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); // A vector zext_in_reg may be represented as a shuffle, // feeding into a bitcast (this represents anyext) feeding into // an and with a mask. // We'd like to try to combine that into a shuffle with zero // plus a bitcast, removing the and. if (N0.getOpcode() != ISD::BITCAST || N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE) return SDValue(); // The other side of the AND should be a splat of 2^C, where C // is the number of bits in the source type. if (N1.getOpcode() == ISD::BITCAST) N1 = N1.getOperand(0); if (N1.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); BuildVectorSDNode *Vector = cast(N1); ShuffleVectorSDNode *Shuffle = cast(N0.getOperand(0)); EVT SrcType = Shuffle->getValueType(0); // We expect a single-source shuffle if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF) return SDValue(); unsigned SrcSize = SrcType.getScalarSizeInBits(); APInt SplatValue, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (!Vector->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs)) return SDValue(); unsigned ResSize = N1.getValueType().getScalarSizeInBits(); // Make sure the splat matches the mask we expect if (SplatBitSize > ResSize || (SplatValue + 1).exactLogBase2() != (int)SrcSize) return SDValue(); // Make sure the input and output size make sense if (SrcSize >= ResSize || ResSize % SrcSize) return SDValue(); // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...> // The number of u's between each two values depends on the ratio between // the source and dest type. unsigned ZextRatio = ResSize / SrcSize; bool IsZext = true; for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) { if (i % ZextRatio) { if (Shuffle->getMaskElt(i) > 0) { // Expected undef IsZext = false; break; } } else { if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) { // Expected element number IsZext = false; break; } } } if (!IsZext) return SDValue(); // Ok, perform the transformation - replace the shuffle with // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero // (instead of undef) where the k elements come from the zero vector. SmallVector Mask; unsigned NumElems = SrcType.getVectorNumElements(); for (unsigned i = 0; i < NumElems; ++i) if (i % ZextRatio) Mask.push_back(NumElems); else Mask.push_back(i / ZextRatio); SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL, Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask); return DAG.getBitcast(N0.getValueType(), NewShuffle); } /// If both input operands of a logic op are being cast from floating point /// types, try to convert this into a floating point logic node to avoid /// unnecessary moves from SSE to integer registers. static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { unsigned FPOpcode = ISD::DELETED_NODE; if (N->getOpcode() == ISD::AND) FPOpcode = X86ISD::FAND; else if (N->getOpcode() == ISD::OR) FPOpcode = X86ISD::FOR; else if (N->getOpcode() == ISD::XOR) FPOpcode = X86ISD::FXOR; assert(FPOpcode != ISD::DELETED_NODE && "Unexpected input node for FP logic conversion"); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && ((Subtarget->hasSSE1() && VT == MVT::i32) || (Subtarget->hasSSE2() && VT == MVT::i64))) { SDValue N00 = N0.getOperand(0); SDValue N10 = N1.getOperand(0); EVT N00Type = N00.getValueType(); EVT N10Type = N10.getValueType(); if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) { SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); return DAG.getBitcast(VT, FPLogic); } } return SDValue(); } static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget)) return Zext; if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); // Create BEXTR instructions // BEXTR is ((X >> imm) & (2**size-1)) if (VT == MVT::i32 || VT == MVT::i64) { // Check for BEXTR. if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { ConstantSDNode *MaskNode = dyn_cast(N1); ConstantSDNode *ShiftNode = dyn_cast(N0.getOperand(1)); if (MaskNode && ShiftNode) { uint64_t Mask = MaskNode->getZExtValue(); uint64_t Shift = ShiftNode->getZExtValue(); if (isMask_64(Mask)) { uint64_t MaskSize = countPopulation(Mask); if (Shift + MaskSize <= VT.getSizeInBits()) return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), DAG.getConstant(Shift | (MaskSize << 8), DL, VT)); } } } // BEXTR return SDValue(); } // Want to form ANDNP nodes: // 1) In the hopes of then easily combining them with OR and AND nodes // to form PBLEND/PSIGN. // 2) To match ANDN packed intrinsics if (VT != MVT::v2i64 && VT != MVT::v4i64) return SDValue(); // Check LHS for vnot if (N0.getOpcode() == ISD::XOR && //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); // Check RHS for vnot if (N1.getOpcode() == ISD::XOR && //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); return SDValue(); } static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); // look for psign/blend if (VT == MVT::v2i64 || VT == MVT::v4i64) { if (!Subtarget->hasSSSE3() || (VT == MVT::v4i64 && !Subtarget->hasInt256())) return SDValue(); // Canonicalize pandn to RHS if (N0.getOpcode() == X86ISD::ANDNP) std::swap(N0, N1); // or (and (m, y), (pandn m, x)) if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { SDValue Mask = N1.getOperand(0); SDValue X = N1.getOperand(1); SDValue Y; if (N0.getOperand(0) == Mask) Y = N0.getOperand(1); if (N0.getOperand(1) == Mask) Y = N0.getOperand(0); // Check to see if the mask appeared in both the AND and ANDNP and if (!Y.getNode()) return SDValue(); // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. // Look through mask bitcast. if (Mask.getOpcode() == ISD::BITCAST) Mask = Mask.getOperand(0); if (X.getOpcode() == ISD::BITCAST) X = X.getOperand(0); if (Y.getOpcode() == ISD::BITCAST) Y = Y.getOperand(0); EVT MaskVT = Mask.getValueType(); // Validate that the Mask operand is a vector sra node. // FIXME: what to do for bytes, since there is a psignb/pblendvb, but // there is no psrai.b unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); unsigned SraAmt = ~0; if (Mask.getOpcode() == ISD::SRA) { if (auto *AmtBV = dyn_cast(Mask.getOperand(1))) if (auto *AmtConst = AmtBV->getConstantSplatNode()) SraAmt = AmtConst->getZExtValue(); } else if (Mask.getOpcode() == X86ISD::VSRAI) { SDValue SraC = Mask.getOperand(1); SraAmt = cast(SraC)->getZExtValue(); } if ((SraAmt + 1) != EltBits) return SDValue(); SDLoc DL(N); // Now we know we at least have a plendvb with the mask val. See if // we can form a psignb/w/d. // psign = x.type == y.type == mask.type && y = sub(0, x); if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Unsupported VT for PSIGN"); Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); return DAG.getBitcast(VT, Mask); } // PBLENDVB only available on SSE 4.1 if (!Subtarget->hasSSE41()) return SDValue(); MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); Mask = DAG.getBitcast(BlendVT, Mask); Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); return DAG.getBitcast(VT, Mask); } } if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); // SHLD/SHRD instructions have lower register pressure, but on some // platforms they have higher latency than the equivalent // series of shifts/or that would otherwise be generated. // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions // have higher latencies and we are not optimizing for size. if (!OptForSize && Subtarget->isSHLDSlow()) return SDValue(); if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) std::swap(N0, N1); if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) return SDValue(); if (!N0.hasOneUse() || !N1.hasOneUse()) return SDValue(); SDValue ShAmt0 = N0.getOperand(1); if (ShAmt0.getValueType() != MVT::i8) return SDValue(); SDValue ShAmt1 = N1.getOperand(1); if (ShAmt1.getValueType() != MVT::i8) return SDValue(); if (ShAmt0.getOpcode() == ISD::TRUNCATE) ShAmt0 = ShAmt0.getOperand(0); if (ShAmt1.getOpcode() == ISD::TRUNCATE) ShAmt1 = ShAmt1.getOperand(0); SDLoc DL(N); unsigned Opc = X86ISD::SHLD; SDValue Op0 = N0.getOperand(0); SDValue Op1 = N1.getOperand(0); if (ShAmt0.getOpcode() == ISD::SUB) { Opc = X86ISD::SHRD; std::swap(Op0, Op1); std::swap(ShAmt0, ShAmt1); } unsigned Bits = VT.getSizeInBits(); if (ShAmt1.getOpcode() == ISD::SUB) { SDValue Sum = ShAmt1.getOperand(0); if (ConstantSDNode *SumC = dyn_cast(Sum)) { SDValue ShAmt1Op1 = ShAmt1.getOperand(1); if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) ShAmt1Op1 = ShAmt1Op1.getOperand(0); if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) return DAG.getNode(Opc, DL, VT, Op0, Op1, DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); } } else if (ConstantSDNode *ShAmt1C = dyn_cast(ShAmt1)) { ConstantSDNode *ShAmt0C = dyn_cast(ShAmt0); if (ShAmt0C && ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) return DAG.getNode(Opc, DL, VT, N0.getOperand(0), N1.getOperand(0), DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); } return SDValue(); } // Generate NEG and CMOV for integer abs. static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); // Since X86 does not have CMOV for 8-bit integer, we don't convert // 8-bit integer abs to NEG and CMOV. if (VT.isInteger() && VT.getSizeInBits() == 8) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) // and change it to SUB and CMOV. if (VT.isInteger() && N->getOpcode() == ISD::XOR && N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) if (ConstantSDNode *Y1C = dyn_cast(N1.getOperand(1))) if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { // Generate SUB & CMOV. SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), DAG.getConstant(0, DL, VT), N0.getOperand(0)); SDValue Ops[] = { N0.getOperand(0), Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8), SDValue(Neg.getNode(), 1) }; return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); } return SDValue(); } // Try to turn tests against the signbit in the form of: // XOR(TRUNCATE(SRL(X, size(X)-1)), 1) // into: // SETGT(X, -1) static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { // This is only worth doing if the output type is i8. if (N->getValueType(0) != MVT::i8) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // We should be performing an xor against a truncated shift. if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) return SDValue(); // Make sure we are performing an xor against one. if (!isOneConstant(N1)) return SDValue(); // SetCC on x86 zero extends so only act on this if it's a logical shift. SDValue Shift = N0.getOperand(0); if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) return SDValue(); // Make sure we are truncating from one of i16, i32 or i64. EVT ShiftTy = Shift.getValueType(); if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) return SDValue(); // Make sure the shift amount extracts the sign bit. if (!isa(Shift.getOperand(1)) || Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) return SDValue(); // Create a greater-than comparison against -1. // N.B. Using SETGE against 0 works but we want a canonical looking // comparison, using SETGT matches up with what TranslateX86CC. SDLoc DL(N); SDValue ShiftOp = Shift.getOperand(0); EVT ShiftOpTy = ShiftOp.getValueType(); SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp, DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); return Cond; } static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) return RV; if (Subtarget->hasCMov()) if (SDValue RV = performIntegerAbsCombine(N, DAG)) return RV; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; return SDValue(); } /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient /// X86ISD::AVG instruction. static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget *Subtarget, SDLoc DL) { if (!VT.isVector() || !VT.isSimple()) return SDValue(); EVT InVT = In.getValueType(); unsigned NumElems = VT.getVectorNumElements(); EVT ScalarVT = VT.getVectorElementType(); if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && isPowerOf2_32(NumElems))) return SDValue(); // InScalarVT is the intermediate type in AVG pattern and it should be greater // than the original input type (i8/i16). EVT InScalarVT = InVT.getVectorElementType(); if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits()) return SDValue(); if (Subtarget->hasAVX512()) { if (VT.getSizeInBits() > 512) return SDValue(); } else if (Subtarget->hasAVX2()) { if (VT.getSizeInBits() > 256) return SDValue(); } else { if (VT.getSizeInBits() > 128) return SDValue(); } // Detect the following pattern: // // %1 = zext %a to // %2 = zext %b to // %3 = add nuw nsw %1, // %4 = add nuw nsw %3, %2 // %5 = lshr %N, // %6 = trunc %5 to // // In AVX512, the last instruction can also be a trunc store. if (In.getOpcode() != ISD::SRL) return SDValue(); // A lambda checking the given SDValue is a constant vector and each element // is in the range [Min, Max]. auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) { BuildVectorSDNode *BV = dyn_cast(V); if (!BV || !BV->isConstant()) return false; for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) { ConstantSDNode *C = dyn_cast(V.getOperand(i)); if (!C) return false; uint64_t Val = C->getZExtValue(); if (Val < Min || Val > Max) return false; } return true; }; // Check if each element of the vector is left-shifted by one. auto LHS = In.getOperand(0); auto RHS = In.getOperand(1); if (!IsConstVectorInRange(RHS, 1, 1)) return SDValue(); if (LHS.getOpcode() != ISD::ADD) return SDValue(); // Detect a pattern of a + b + 1 where the order doesn't matter. SDValue Operands[3]; Operands[0] = LHS.getOperand(0); Operands[1] = LHS.getOperand(1); // Take care of the case when one of the operands is a constant vector whose // element is in the range [1, 256]. if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && Operands[0].getOpcode() == ISD::ZERO_EXTEND && Operands[0].getOperand(0).getValueType() == VT) { // The pattern is detected. Subtract one from the constant vector, then // demote it and emit X86ISD::AVG instruction. SDValue One = DAG.getConstant(1, DL, InScalarVT); SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT, SmallVector(NumElems, One)); Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones); Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), Operands[1]); } if (Operands[0].getOpcode() == ISD::ADD) std::swap(Operands[0], Operands[1]); else if (Operands[1].getOpcode() != ISD::ADD) return SDValue(); Operands[2] = Operands[1].getOperand(0); Operands[1] = Operands[1].getOperand(1); // Now we have three operands of two additions. Check that one of them is a // constant vector with ones, and the other two are promoted from i8/i16. for (int i = 0; i < 3; ++i) { if (!IsConstVectorInRange(Operands[i], 1, 1)) continue; std::swap(Operands[i], Operands[2]); // Check if Operands[0] and Operands[1] are results of type promotion. for (int j = 0; j < 2; ++j) if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || Operands[j].getOperand(0).getValueType() != VT) return SDValue(); // The pattern is detected, emit X86ISD::AVG instruction. return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), Operands[1].getOperand(0)); } return SDValue(); } /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { LoadSDNode *Ld = cast(N); EVT RegVT = Ld->getValueType(0); EVT MemVT = Ld->getMemoryVT(); SDLoc dl(Ld); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // For chips with slow 32-byte unaligned loads, break the 32-byte operation // into two 16-byte operations. ISD::LoadExtType Ext = Ld->getExtensionType(); bool Fast; unsigned AddressSpace = Ld->getAddressSpace(); unsigned Alignment = Ld->getAlignment(); if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && Ext == ISD::NON_EXTLOAD && TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, AddressSpace, Alignment, &Fast) && !Fast) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); SDValue Ptr = Ld->getBasePtr(); SDValue Increment = DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems/2); SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Alignment); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), std::min(16U, Alignment)); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), Load2.getValue(1)); SDValue NewVec = DAG.getUNDEF(RegVT); NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); return DCI.CombineTo(N, NewVec, TF, true); } return SDValue(); } /// PerformMLOADCombine - Resolve extending loads static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { MaskedLoadSDNode *Mld = cast(N); if (Mld->getExtensionType() != ISD::SEXTLOAD) return SDValue(); EVT VT = Mld->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); EVT LdVT = Mld->getMemoryVT(); SDLoc dl(Mld); assert(LdVT != VT && "Cannot extend to the same type"); unsigned ToSz = VT.getVectorElementType().getSizeInBits(); unsigned FromSz = LdVT.getVectorElementType().getSizeInBits(); // From, To sizes and ElemCount must be pow of two assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for extending masked load"); unsigned SizeRatio = ToSz / FromSz; assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), LdVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); // Convert Src0 value SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0()); if (Mld->getSrc0().getOpcode() != ISD::UNDEF) { SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && "WideVecVT should be legal"); WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); } // Prepare the new mask SDValue NewMask; SDValue Mask = Mld->getMask(); if (Mask.getValueType() == VT) { // Mask and original value have the same type NewMask = DAG.getBitcast(WideVecVT, Mask); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) ShuffleVec[i] = NumElems * SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), &ShuffleVec[0]); } else { assert(Mask.getValueType().getVectorElementType() == MVT::i1); unsigned WidenNumElts = NumElems*SizeRatio; unsigned MaskNumElts = VT.getVectorNumElements(); EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, WidenNumElts); unsigned NumConcat = WidenNumElts / MaskNumElts; SmallVector Ops(NumConcat); SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); Ops[0] = Mask; for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = ZeroVal; NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), Mld->getBasePtr(), NewMask, WideSrc0, Mld->getMemoryVT(), Mld->getMemOperand(), ISD::NON_EXTLOAD); SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); } /// PerformMSTORECombine - Resolve truncating stores static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { MaskedStoreSDNode *Mst = cast(N); if (!Mst->isTruncatingStore()) return SDValue(); EVT VT = Mst->getValue().getValueType(); unsigned NumElems = VT.getVectorNumElements(); EVT StVT = Mst->getMemoryVT(); SDLoc dl(Mst); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // The truncating store is legal in some cases. For example // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw // are designated for truncate store. // In this case we don't need any further transformations. if (TLI.isTruncStoreLegal(VT, StVT)) return SDValue(); // From, To sizes and ElemCount must be pow of two assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for truncating masked store"); // We are going to use the original vector elt for storing. // Accumulated smaller vector elements must be a multiple of the store size. assert (((NumElems * FromSz) % ToSz) == 0 && "Unexpected ratio for truncating masked store"); unsigned SizeRatio = FromSz / ToSz; assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && "WideVecVT should be legal"); SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); SDValue NewMask; SDValue Mask = Mst->getMask(); if (Mask.getValueType() == VT) { // Mask and original value have the same type NewMask = DAG.getBitcast(WideVecVT, Mask); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) ShuffleVec[i] = NumElems*SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), &ShuffleVec[0]); } else { assert(Mask.getValueType().getVectorElementType() == MVT::i1); unsigned WidenNumElts = NumElems*SizeRatio; unsigned MaskNumElts = VT.getVectorNumElements(); EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, WidenNumElts); unsigned NumConcat = WidenNumElts / MaskNumElts; SmallVector Ops(NumConcat); SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); Ops[0] = Mask; for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = ZeroVal; NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), NewMask, StVT, Mst->getMemOperand(), false); } /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { StoreSDNode *St = cast(N); EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); SDLoc dl(St); SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we are saving a concatenation of two XMM registers and 32-byte stores // are slow, such as on Sandy Bridge, perform two 16-byte stores. bool Fast; unsigned AddressSpace = St->getAddressSpace(); unsigned Alignment = St->getAlignment(); if (VT.is256BitVector() && StVT == VT && TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, AddressSpace, Alignment, &Fast) && !Fast) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); SDValue Stride = DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); SDValue Ptr0 = St->getBasePtr(); SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), Alignment); SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), std::min(16U, Alignment)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } // Optimize trunc store (of multiple scalars) to shuffle and store. // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. if (St->isTruncatingStore() && VT.isVector()) { // Check if we can detect an AVG pattern from the truncation. If yes, // replace the trunc store by a normal store with the result of X86ISD::AVG // instruction. SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl); if (Avg.getNode()) return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); // The truncating store is legal in some cases. For example // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw // are designated for truncate store. // In this case we don't need any further transformations. if (TLI.isTruncStoreLegal(VT, StVT)) return SDValue(); // From, To sizes and ElemCount must be pow of two if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); // We are going to use the original vector elt for storing. // Accumulated smaller vector elements must be a multiple of the store size. if (0 != (NumElems * FromSz) % ToSz) return SDValue(); unsigned SizeRatio = FromSz / ToSz; assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); // At this point all of the data is stored at the bottom of the // register. We now need to save it to mem. // Find the largest store unit MVT StoreType = MVT::i8; for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) StoreType = Tp; } // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && (64 <= NumElems * ToSz)) StoreType = MVT::f64; // Bitcast the original vector into a vector of store-size units EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); SmallVector Chains; SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, dl, TLI.getPointerTy(DAG.getDataLayout())); SDValue Ptr = St->getBasePtr(); // Perform one or more big stores into memory. for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StoreType, ShuffWide, DAG.getIntPtrConstant(i, dl)); SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); Chains.push_back(Ch); } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } // Turn load->store of MMX types into GPR load/stores. This avoids clobbering // the FP state in cases where an emms may be missing. // A preferable solution to the general problem is to figure out the right // places to insert EMMS. This qualifies as a quick hack. // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. if (VT.getSizeInBits() != 64) return SDValue(); const Function *F = DAG.getMachineFunction().getFunction(); bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = !Subtarget->useSoftFloat() && !NoImplicitFloatOps && Subtarget->hasSSE2(); if ((VT.isVector() || (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && isa(St->getValue()) && !cast(St->getValue())->isVolatile() && St->getChain().hasOneUse() && !St->isVolatile()) { SDNode* LdVal = St->getValue().getNode(); LoadSDNode *Ld = nullptr; int TokenFactorIndex = -1; SmallVector Ops; SDNode* ChainVal = St->getChain().getNode(); // Must be a store of a load. We currently handle two cases: the load // is a direct child, and it's under an intervening TokenFactor. It is // possible to dig deeper under nested TokenFactors. if (ChainVal == LdVal) Ld = cast(St->getChain()); else if (St->getValue().hasOneUse() && ChainVal->getOpcode() == ISD::TokenFactor) { for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { if (ChainVal->getOperand(i).getNode() == LdVal) { TokenFactorIndex = i; Ld = cast(St->getValue()); } else Ops.push_back(ChainVal->getOperand(i)); } } if (!Ld || !ISD::isNormalLoad(Ld)) return SDValue(); // If this is not the MMX case, i.e. we are just turning i64 load/store // into f64 load/store, avoid the transformation if there are multiple // uses of the loaded value. if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) return SDValue(); SDLoc LdDL(Ld); SDLoc StDL(N); // If we are a 64-bit capable x86, lower to a single movq load/store pair. // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. if (Subtarget->is64Bit() || F64IsLegal) { MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Ld->getAlignment()); SDValue NewChain = NewLd.getValue(1); if (TokenFactorIndex != -1) { Ops.push_back(NewChain); NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); } return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); } // Otherwise, lower to two pairs of 32-bit loads / stores. SDValue LoAddr = Ld->getBasePtr(); SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, DAG.getConstant(4, LdDL, MVT::i32)); SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Ld->getAlignment()); SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, Ld->getPointerInfo().getWithOffset(4), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), MinAlign(Ld->getAlignment(), 4)); SDValue NewChain = LoLd.getValue(1); if (TokenFactorIndex != -1) { Ops.push_back(LoLd); Ops.push_back(HiLd); NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); } LoAddr = St->getBasePtr(); HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, DAG.getConstant(4, StDL, MVT::i32)); SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4), St->isVolatile(), St->isNonTemporal(), MinAlign(St->getAlignment(), 4)); return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); } // This is similar to the above case, but here we handle a scalar 64-bit // integer store that is extracted from a vector on a 32-bit target. // If we have SSE2, then we can treat it like a floating-point double // to get past legalization. The execution dependencies fixup pass will // choose the optimal machine instruction for the store if this really is // an integer or v2f32 rather than an f64. if (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit() && St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue OldExtract = St->getOperand(1); SDValue ExtOp0 = OldExtract.getOperand(0); unsigned VecSize = ExtOp0.getValueSizeInBits(); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64); SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0); SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, BitCast, OldExtract.getOperand(1)); return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); } return SDValue(); } /// Return 'true' if this vector operation is "horizontal" /// and return the operands for the horizontal operation in LHS and RHS. A /// horizontal operation performs the binary operation on successive elements /// of its first operand, then on successive elements of its second operand, /// returning the resulting values in a vector. For example, if /// A = < float a0, float a1, float a2, float a3 > /// and /// B = < float b0, float b1, float b2, float b3 > /// then the result of doing a horizontal operation on A and B is /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. /// Note that the binary operation should have the property that if one of the /// operands is UNDEF then the result is UNDEF. static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // Look for the following pattern: if // A = < float a0, float a1, float a2, float a3 > // B = < float b0, float b1, float b2, float b3 > // and // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > // which is A horizontal-op B. // At least one of the operands should be a vector shuffle. if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && RHS.getOpcode() != ISD::VECTOR_SHUFFLE) return false; MVT VT = LHS.getSimpleValueType(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for horizontal add/sub"); // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to // operate independently on 128-bit lanes. unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts / NumLanes; assert((NumLaneElts % 2 == 0) && "Vector type should have an even number of elements in each lane"); unsigned HalfLaneElts = NumLaneElts/2; // View LHS in the form // LHS = VECTOR_SHUFFLE A, B, LMask // If LHS is not a shuffle then pretend it is the shuffle // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> // NOTE: in what follows a default initialized SDValue represents an UNDEF of // type VT. SDValue A, B; SmallVector LMask(NumElts); if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) A = LHS.getOperand(0); if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) B = LHS.getOperand(1); ArrayRef Mask = cast(LHS.getNode())->getMask(); std::copy(Mask.begin(), Mask.end(), LMask.begin()); } else { if (LHS.getOpcode() != ISD::UNDEF) A = LHS; for (unsigned i = 0; i != NumElts; ++i) LMask[i] = i; } // Likewise, view RHS in the form // RHS = VECTOR_SHUFFLE C, D, RMask SDValue C, D; SmallVector RMask(NumElts); if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) C = RHS.getOperand(0); if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) D = RHS.getOperand(1); ArrayRef Mask = cast(RHS.getNode())->getMask(); std::copy(Mask.begin(), Mask.end(), RMask.begin()); } else { if (RHS.getOpcode() != ISD::UNDEF) C = RHS; for (unsigned i = 0; i != NumElts; ++i) RMask[i] = i; } // Check that the shuffles are both shuffling the same vectors. if (!(A == C && B == D) && !(A == D && B == C)) return false; // If everything is UNDEF then bail out: it would be better to fold to UNDEF. if (!A.getNode() && !B.getNode()) return false; // If A and B occur in reverse order in RHS, then "swap" them (which means // rewriting the mask). if (A != C) ShuffleVectorSDNode::commuteMask(RMask); // At this point LHS and RHS are equivalent to // LHS = VECTOR_SHUFFLE A, B, LMask // RHS = VECTOR_SHUFFLE A, B, RMask // Check that the masks correspond to performing a horizontal operation. for (unsigned l = 0; l != NumElts; l += NumLaneElts) { for (unsigned i = 0; i != NumLaneElts; ++i) { int LIdx = LMask[i+l], RIdx = RMask[i+l]; // Ignore any UNDEF components. if (LIdx < 0 || RIdx < 0 || (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; // Check that successive elements are being operated on. If not, this is // not a horizontal operation. unsigned Src = (i/HalfLaneElts); // each lane is split between srcs int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; if (!(LIdx == Index && RIdx == Index + 1) && !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) return false; } } LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. return true; } /// Do target-specific dag combines on floating point adds. static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); // Try to synthesize horizontal adds from adds of shuffles. if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, true)) return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS); return SDValue(); } /// Do target-specific dag combines on floating point subs. static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); // Try to synthesize horizontal subs from subs of shuffles. if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, false)) return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS); return SDValue(); } /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS. static SDValue combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, SmallVector &Regs) { assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 || Regs[0].getValueType() == MVT::v2i64)); EVT OutVT = N->getValueType(0); EVT OutSVT = OutVT.getVectorElementType(); EVT InVT = Regs[0].getValueType(); EVT InSVT = InVT.getVectorElementType(); SDLoc DL(N); // First, use mask to unset all bits that won't appear in the result. assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) && "OutSVT can only be either i8 or i16."); SDValue MaskVal = DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT); SDValue MaskVec = DAG.getNode( ISD::BUILD_VECTOR, DL, InVT, SmallVector(InVT.getVectorNumElements(), MaskVal)); for (auto &Reg : Regs) Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg); MVT UnpackedVT, PackedVT; if (OutSVT == MVT::i8) { UnpackedVT = MVT::v8i16; PackedVT = MVT::v16i8; } else { UnpackedVT = MVT::v4i32; PackedVT = MVT::v8i16; } // In each iteration, truncate the type by a half size. auto RegNum = Regs.size(); for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits(); j < e; j *= 2, RegNum /= 2) { for (unsigned i = 0; i < RegNum; i++) Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]); for (unsigned i = 0; i < RegNum / 2; i++) Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2], Regs[i * 2 + 1]); } // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and // then extract a subvector as the result since v8i8 is not a legal type. if (OutVT == MVT::v8i8) { Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]); Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0], DAG.getIntPtrConstant(0, DL)); return Regs[0]; } else if (RegNum > 1) { Regs.resize(RegNum); return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs); } else return Regs[0]; } /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS. static SDValue combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG, SmallVector &Regs) { assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32); EVT OutVT = N->getValueType(0); SDLoc DL(N); // Shift left by 16 bits, then arithmetic-shift right by 16 bits. SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32); for (auto &Reg : Regs) { Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG); Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG); } for (unsigned i = 0, e = Regs.size() / 2; i < e; i++) Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2], Regs[i * 2 + 1]); if (Regs.size() > 2) { Regs.resize(Regs.size() / 2); return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs); } else return Regs[0]; } /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type /// legalization the truncation will be translated into a BUILD_VECTOR with each /// element that is extracted from a vector and then truncated, and it is /// diffcult to do this optimization based on them. static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT OutVT = N->getValueType(0); if (!OutVT.isVector()) return SDValue(); SDValue In = N->getOperand(0); if (!In.getValueType().isSimple()) return SDValue(); EVT InVT = In.getValueType(); unsigned NumElems = OutVT.getVectorNumElements(); // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on // SSE2, and we need to take care of it specially. // AVX512 provides vpmovdb. if (!Subtarget->hasSSE2() || Subtarget->hasAVX2()) return SDValue(); EVT OutSVT = OutVT.getVectorElementType(); EVT InSVT = InVT.getVectorElementType(); if (!((InSVT == MVT::i32 || InSVT == MVT::i64) && (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) && NumElems >= 8)) return SDValue(); // SSSE3's pshufb results in less instructions in the cases below. if (Subtarget->hasSSSE3() && NumElems == 8 && ((OutSVT == MVT::i8 && InSVT != MVT::i64) || (InSVT == MVT::i32 && OutSVT == MVT::i16))) return SDValue(); SDLoc DL(N); // Split a long vector into vectors of legal type. unsigned RegNum = InVT.getSizeInBits() / 128; SmallVector SubVec(RegNum); if (InSVT == MVT::i32) { for (unsigned i = 0; i < RegNum; i++) SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(i * 4, DL)); } else { for (unsigned i = 0; i < RegNum; i++) SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(i * 2, DL)); } // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PAKCUS // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to // truncate 2 x v4i32 to v8i16. if (Subtarget->hasSSE41() || OutSVT == MVT::i8) return combineVectorTruncationWithPACKUS(N, DAG, SubVec); else if (InSVT == MVT::i32) return combineVectorTruncationWithPACKSS(N, DAG, SubVec); else return SDValue(); } static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { // Try to detect AVG pattern first. SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, Subtarget, SDLoc(N)); if (Avg.getNode()) return Avg; return combineVectorTruncation(N, DAG, Subtarget); } /// Do target-specific dag combines on floating point negations. static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); EVT SVT = VT.getScalarType(); SDValue Arg = N->getOperand(0); SDLoc DL(N); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); // If we're negating a FMUL node on a target with FMA, then we can avoid the // use of a constant by performing (-0 - A*B) instead. // FIXME: Check rounding control flags as well once it becomes available. if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) { SDValue Zero = DAG.getConstantFP(0.0, DL, VT); return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), Arg.getOperand(1), Zero); } // If we're negating a FMA node, then we can adjust the // instruction to include the extra negation. if (Arg.hasOneUse()) { switch (Arg.getOpcode()) { case X86ISD::FMADD: return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), Arg.getOperand(1), Arg.getOperand(2)); case X86ISD::FMSUB: return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0), Arg.getOperand(1), Arg.getOperand(2)); case X86ISD::FNMADD: return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0), Arg.getOperand(1), Arg.getOperand(2)); case X86ISD::FNMSUB: return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0), Arg.getOperand(1), Arg.getOperand(2)); } } return SDValue(); } static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); if (VT.is512BitVector() && !Subtarget->hasDQI()) { // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extention. // These logic operations may be executed in the integer domain. SDLoc dl(N); MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits()); MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements()); SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0)); SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1)); unsigned IntOpcode = 0; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected FP logic op"); case X86ISD::FOR: IntOpcode = ISD::OR; break; case X86ISD::FXOR: IntOpcode = ISD::XOR; break; case X86ISD::FAND: IntOpcode = ISD::AND; break; case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; } SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); return DAG.getNode(ISD::BITCAST, dl, VT, IntOp); } return SDValue(); } /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); // F[X]OR(0.0, x) -> x if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); // F[X]OR(x, 0.0) -> x if (ConstantFPSDNode *C = dyn_cast(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(0); return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); // Only perform optimizations if UnsafeMath is used. if (!DAG.getTarget().Options.UnsafeFPMath) return SDValue(); // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes // into FMINC and FMAXC, which are Commutative operations. unsigned NewOp = 0; switch (N->getOpcode()) { default: llvm_unreachable("unknown opcode"); case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; } return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1)); } static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { if (Subtarget->useSoftFloat()) return SDValue(); // TODO: Check for global or instruction-level "nnan". In that case, we // should be able to lower to FMAX/FMIN alone. // TODO: If an operand is already known to be a NaN or not a NaN, this // should be an optional swap and FMAX/FMIN. EVT VT = N->getValueType(0); if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || (Subtarget->hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) return SDValue(); // This takes at least 3 instructions, so favor a library call when operating // on a scalar and minimizing code size. if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDLoc DL(N); EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( DAG.getDataLayout(), *DAG.getContext(), VT); // There are 4 possibilities involving NaN inputs, and these are the required // outputs: // Op1 // Num NaN // ---------------- // Num | Max | Op0 | // Op0 ---------------- // NaN | Op1 | NaN | // ---------------- // // The SSE FP max/min instructions were not designed for this case, but rather // to implement: // Min = Op1 < Op0 ? Op1 : Op0 // Max = Op1 > Op0 ? Op1 : Op0 // // So they always return Op0 if either input is a NaN. However, we can still // use those instructions for fmaxnum by selecting away a NaN input. // If either operand is NaN, the 2nd source operand (Op0) is passed through. auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO); // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands // are NaN, the NaN value of Op1 is the result. auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); } /// Do target-specific dag combines on X86ISD::FAND nodes. static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { // FAND(0.0, x) -> 0.0 if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) if (C->getValueAPF().isPosZero()) return N->getOperand(0); // FAND(x, 0.0) -> 0.0 if (ConstantFPSDNode *C = dyn_cast(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FANDN nodes static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { // FANDN(0.0, x) -> x if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); // FANDN(x, 0.0) -> 0.0 if (ConstantFPSDNode *C = dyn_cast(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); return lowerX86FPLogicOp(N, DAG, Subtarget); } static SDValue PerformBTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // BT ignores high bits in the bit index operand. SDValue Op1 = N->getOperand(1); if (Op1.hasOneUse()) { unsigned BitWidth = Op1.getValueSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) DCI.CommitTargetLoweringOpt(TLO); } return SDValue(); } static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { SDValue Op = N->getOperand(0); if (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); EVT VT = N->getValueType(0), OpVT = Op.getValueType(); if (Op.getOpcode() == X86ISD::VZEXT_LOAD && VT.getVectorElementType().getSizeInBits() == OpVT.getVectorElementType().getSizeInBits()) { return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); } return SDValue(); } static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); if (!VT.isVector()) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT ExtraVT = cast(N1)->getVT(); SDLoc dl(N); // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the // both SSE and AVX2 since there is no sign-extended shift right // operation on a vector with 64-bit elements. //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) -> // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT))) if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND)) { SDValue N00 = N0.getOperand(0); // EXTLOAD has a better solution on AVX2, // it may be replaced with X86ISD::VSEXT node. if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256()) if (!ISD::isNormalLoad(N00.getNode())) return SDValue(); if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1); return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); } } return SDValue(); } /// sext(add_nsw(x, C)) --> add(sext(x), C_sext) /// Promoting a sign extension ahead of an 'add nsw' exposes opportunities /// to combine math ops, use an LEA, or use a complex addressing mode. This can /// eliminate extend, add, and shift instructions. static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, const X86Subtarget *Subtarget) { // TODO: This should be valid for other integer types. EVT VT = Sext->getValueType(0); if (VT != MVT::i64) return SDValue(); // We need an 'add nsw' feeding into the 'sext'. SDValue Add = Sext->getOperand(0); if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap()) return SDValue(); // Having a constant operand to the 'add' ensures that we are not increasing // the instruction count because the constant is extended for free below. // A constant operand can also become the displacement field of an LEA. auto *AddOp1 = dyn_cast(Add.getOperand(1)); if (!AddOp1) return SDValue(); // Don't make the 'add' bigger if there's no hope of combining it with some // other 'add' or 'shl' instruction. // TODO: It may be profitable to generate simpler LEA instructions in place // of single 'add' instructions, but the cost model for selecting an LEA // currently has a high threshold. bool HasLEAPotential = false; for (auto *User : Sext->uses()) { if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { HasLEAPotential = true; break; } } if (!HasLEAPotential) return SDValue(); // Everything looks good, so pull the 'sext' ahead of the 'add'. int64_t AddConstant = AddOp1->getSExtValue(); SDValue AddOp0 = Add.getOperand(0); SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0); SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); // The wider add is guaranteed to not wrap because both operands are // sign-extended. SDNodeFlags Flags; Flags.setNoSignedWrap(true); return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags); } /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) -> /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y) /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly /// extends from AH (which we otherwise need to do contortions to access). static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); auto OpcodeN = N->getOpcode(); auto OpcodeN0 = N0.getOpcode(); if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) || (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM))) return SDValue(); EVT VT = N->getValueType(0); EVT InVT = N0.getValueType(); if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32) return SDValue(); SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG : X86ISD::UDIVREM8_ZEXT_HREG; SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0), N0.getOperand(1)); DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); return R.getValue(1); } static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SVT = VT.getScalarType(); EVT InVT = N0.getValueType(); EVT InSVT = InVT.getScalarType(); SDLoc DL(N); if (SDValue DivRem8 = getDivRem8(N, DAG)) return DivRem8; if (!DCI.isBeforeLegalizeOps()) { if (InVT == MVT::i1) { SDValue Zero = DAG.getConstant(0, DL, VT); SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); } return SDValue(); } if (VT.isVector() && Subtarget->hasSSE2()) { auto ExtendVecSize = [&DAG](SDLoc DL, SDValue N, unsigned Size) { EVT InVT = N.getValueType(); EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), Size / InVT.getScalarSizeInBits()); SmallVector Opnds(Size / InVT.getSizeInBits(), DAG.getUNDEF(InVT)); Opnds[0] = N; return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); }; // If target-size is less than 128-bits, extend to a type that would extend // to 128 bits, extend that and extract the original target vector. if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits()) && (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { unsigned Scale = 128 / VT.getSizeInBits(); EVT ExVT = EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ExVT, Ex); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, DAG.getIntPtrConstant(0, DL)); } // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG // which ensures lowering to X86ISD::VSEXT (pmovsx*). if (VT.getSizeInBits() == 128 && (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { SDValue ExOp = ExtendVecSize(DL, N0, 128); return DAG.getSignExtendVectorInReg(ExOp, DL, VT); } // On pre-AVX2 targets, split into 128-bit nodes of // ISD::SIGN_EXTEND_VECTOR_INREG. if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) && (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { unsigned NumVecs = VT.getSizeInBits() / 128; unsigned NumSubElts = 128 / SVT.getSizeInBits(); EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); SmallVector Opnds; for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, DAG.getIntPtrConstant(Offset, DL)); SrcVec = ExtendVecSize(DL, SrcVec, 128); SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT); Opnds.push_back(SrcVec); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); } } if (Subtarget->hasAVX() && VT.is256BitVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget)) return NewAdd; return SDValue(); } static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget* Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); EVT ScalarVT = VT.getScalarType(); if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA()) return SDValue(); SDValue A = N->getOperand(0); SDValue B = N->getOperand(1); SDValue C = N->getOperand(2); bool NegA = (A.getOpcode() == ISD::FNEG); bool NegB = (B.getOpcode() == ISD::FNEG); bool NegC = (C.getOpcode() == ISD::FNEG); // Negative multiplication when NegA xor NegB bool NegMul = (NegA != NegB); if (NegA) A = A.getOperand(0); if (NegB) B = B.getOperand(0); if (NegC) C = C.getOperand(0); unsigned Opcode; if (!NegMul) Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; else Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; return DAG.getNode(Opcode, dl, VT, A, B, C); } static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> // (and (i32 x86isd::setcc_carry), 1) // This eliminates the zext. This transformation is necessary because // ISD::SETCC is always legalized to i8. SDLoc dl(N); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (N0.getOpcode() == ISD::AND && N0.hasOneUse() && N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { if (!isOneConstant(N0.getOperand(1))) return SDValue(); return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N00.getOperand(0), N00.getOperand(1)), DAG.getConstant(1, dl, VT)); } } if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N00.getOperand(0), N00.getOperand(1)), DAG.getConstant(1, dl, VT)); } } if (VT.is256BitVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; if (SDValue DivRem8 = getDivRem8(N, DAG)) return DivRem8; return SDValue(); } // Optimize x == -y --> x+y == 0 // x != -y --> x+y != 0 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget* Subtarget) { ISD::CondCode CC = cast(N->getOperand(2))->get(); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, LHS.getOperand(1)); return DAG.getSetCC(DL, N->getValueType(0), addV, DAG.getConstant(0, DL, addV.getValueType()), CC); } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, RHS.getOperand(1)); return DAG.getSetCC(DL, N->getValueType(0), addV, DAG.getConstant(0, DL, addV.getValueType()), CC); } if (VT.getScalarType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) && (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); if (!IsSEXT0 || !IsVZero1) { // Swap the operands and update the condition code. std::swap(LHS, RHS); CC = ISD::getSetCCSwappedOperands(CC); IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) && (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); } if (IsSEXT0 && IsVZero1) { assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type"); if (CC == ISD::SETGT) return DAG.getConstant(0, DL, VT); if (CC == ISD::SETLE) return DAG.getConstant(1, DL, VT); if (CC == ISD::SETEQ || CC == ISD::SETGE) return DAG.getNOT(DL, LHS.getOperand(0), VT); assert((CC == ISD::SETNE || CC == ISD::SETLT) && "Unexpected condition code!"); return LHS.getOperand(0); } } return SDValue(); } static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); // Gather and Scatter instructions use k-registers for masks. The type of // the masks is v*i1. So the mask will be truncated anyway. // The SIGN_EXTEND_INREG my be dropped. SDValue Mask = N->getOperand(2); if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) { SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[2] = Mask.getOperand(0); DAG.UpdateNodeOperands(N, NewOps); } return SDValue(); } // Helper function of PerformSETCCCombine. It is to materialize "setb reg" // as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG, MVT VT) { if (VT == MVT::i8) return DAG.getNode(ISD::AND, DL, VT, DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, DAG.getConstant(X86::COND_B, DL, MVT::i8), EFLAGS), DAG.getConstant(1, DL, VT)); assert (VT == MVT::i1 && "Unexpected type for SECCC node"); return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, DAG.getConstant(X86::COND_B, DL, MVT::i8), EFLAGS)); } // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc DL(N); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); if (CC == X86::COND_A) { // Try to convert COND_A into COND_B in an attempt to facilitate // materializing "setb reg". // // Do not flip "e > c", where "c" is a constant, because Cmp instruction // cannot take an immediate as its first operand. // if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && EFLAGS.getValueType().isInteger() && !isa(EFLAGS.getOperand(1))) { SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0)); } } // Materialize "setb reg" as "sbb reg,reg", since it can be extended without // a zext and produces an all-ones bit which is more useful than 0/1 in some // cases. if (CC == X86::COND_B) return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0)); if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) { SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); } return SDValue(); } // Optimize branch condition evaluation. // static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc DL(N); SDValue Chain = N->getOperand(0); SDValue Dest = N->getOperand(1); SDValue EFLAGS = N->getOperand(3); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) { SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, Flags); } return SDValue(); } static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG) { // Take advantage of vector comparisons producing 0 or -1 in each lane to // optimize away operation when it's from a constant. // // The general transformation is: // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> // AND(VECTOR_CMP(x,y), constant2) // constant2 = UNARYOP(constant) // Early exit if this isn't a vector operation, the operand of the // unary operation isn't a bitwise AND, or if the sizes of the operations // aren't the same. EVT VT = N->getValueType(0); if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) return SDValue(); // Now check that the other operand of the AND is a constant. We could // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. if (BuildVectorSDNode *BV = dyn_cast(N->getOperand(0)->getOperand(1))) { // Bail out if the vector isn't a constant. if (!BV->isConstant()) return SDValue(); // Everything checks out. Build up the new and improved node. SDLoc DL(N); EVT IntVT = BV->getValueType(0); // Create a new constant of the appropriate type for the transformed // DAG. SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); // The AND node needs bitcasts to/from an integer vector type around it. SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst); SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, N->getOperand(0)->getOperand(0), MaskConst); SDValue Res = DAG.getBitcast(VT, NewAnd); return Res; } return SDValue(); } static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDValue Op0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); EVT InSVT = InVT.getScalarType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { SDLoc dl(N); EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements()); SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT)) return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P); return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } return SDValue(); } static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; // Now move on to more general possibilities. SDValue Op0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); EVT InSVT = InVT.getScalarType(); // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { SDLoc dl(N); EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements()); SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have // a 32-bit target where SSE doesn't support i64->FP operations. if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) { LoadSDNode *Ld = cast(Op0.getNode()); EVT LdVT = Ld->getValueType(0); // This transformation is not supported if the result type is f16 if (VT == MVT::f16) return SDValue(); if (!Ld->isVolatile() && !VT.isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && !Subtarget->is64Bit() && LdVT == MVT::i64) { SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD( SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); return FILDChain; } } return SDValue(); } // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI) { // If the LHS and RHS of the ADC node are zero, then it can't overflow and // the result is either zero or one (depending on the input carry bit). // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. if (X86::isZeroNode(N->getOperand(0)) && X86::isZeroNode(N->getOperand(1)) && // We don't have a good way to replace an EFLAGS use, so only do this when // dead right now. SDValue(N, 1).use_empty()) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), N->getOperand(2)), DAG.getConstant(1, DL, VT)); return DCI.CombineTo(N, Res1, CarryOut); } return SDValue(); } // fold (add Y, (sete X, 0)) -> adc 0, Y // (add Y, (setne X, 0)) -> sbb -1, Y // (sub (sete X, 0), Y) -> sbb 0, Y // (sub (setne X, 0), Y) -> adc -1, Y static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); // Look through ZExts. SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) return SDValue(); SDValue SetCC = Ext.getOperand(0); if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) return SDValue(); X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); SDValue Cmp = SetCC.getOperand(1); if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || !X86::isZeroNode(Cmp.getOperand(1)) || !Cmp.getOperand(0).getValueType().isInteger()) return SDValue(); SDValue CmpOp0 = Cmp.getOperand(0); SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); if (CC == X86::COND_NE) return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, DL, OtherVal.getValueType(), OtherVal, DAG.getConstant(-1ULL, DL, OtherVal.getValueType()), NewCmp); return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, DL, OtherVal.getValueType(), OtherVal, DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp); } /// PerformADDCombine - Do target-specific dag combines on integer adds. static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); // Try to synthesize horizontal adds from adds of shuffles. if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); return OptimizeConditionalInDecrement(N, DAG); } static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); // X86 can't encode an immediate LHS of a sub. See if we can push the // negation into a preceding instruction. if (ConstantSDNode *C = dyn_cast(Op0)) { // If the RHS of the sub is a XOR with one use and a constant, invert the // immediate. Then add one to the LHS of the sub so we can turn // X-Y -> X+~Y+1, saving one register. if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && isa(Op1.getOperand(1))) { APInt XorC = cast(Op1.getOperand(1))->getAPIntValue(); EVT VT = Op0.getValueType(); SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0), DAG.getConstant(~XorC, SDLoc(Op1), VT)); return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT)); } } // Try to synthesize horizontal adds from adds of shuffles. EVT VT = N->getValueType(0); if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); return OptimizeConditionalInDecrement(N, DAG); } /// performVZEXTCombine - Performs build vector combines static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc DL(N); MVT VT = N->getSimpleValueType(0); SDValue Op = N->getOperand(0); MVT OpVT = Op.getSimpleValueType(); MVT OpEltVT = OpVT.getVectorElementType(); unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); // (vzext (bitcast (vzext (x)) -> (vzext x) SDValue V = Op; while (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); if (V != Op && V.getOpcode() == X86ISD::VZEXT) { MVT InnerVT = V.getSimpleValueType(); MVT InnerEltVT = InnerVT.getVectorElementType(); // If the element sizes match exactly, we can just do one larger vzext. This // is always an exact type match as vzext operates on integer types. if (OpEltVT == InnerEltVT) { assert(OpVT == InnerVT && "Types must match for vzext!"); return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0)); } // The only other way we can combine them is if only a single element of the // inner vzext is used in the input to the outer vzext. if (InnerEltVT.getSizeInBits() < InputBits) return SDValue(); // In this case, the inner vzext is completely dead because we're going to // only look at bits inside of the low element. Just do the outer vzext on // a bitcast of the input to the inner. return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V)); } // Check if we can bypass extracting and re-inserting an element of an input // vector. Essentially: // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { SDValue ExtractedV = V.getOperand(0); SDValue OrigV = ExtractedV.getOperand(0); if (isNullConstant(ExtractedV.getOperand(1))) { MVT OrigVT = OrigV.getSimpleValueType(); // Extract a subvector if necessary... if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits(); OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(), OrigVT.getVectorNumElements() / Ratio); OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, DAG.getIntPtrConstant(0, DL)); } Op = DAG.getBitcast(OpVT, OrigV); return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); } } return SDValue(); } SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: break; case ISD::EXTRACT_VECTOR_ELT: return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); case ISD::VSELECT: case ISD::SELECT: case X86ISD::SHRUNKBLEND: return PerformSELECTCombine(N, DAG, DCI, Subtarget); case ISD::BITCAST: return PerformBITCASTCombine(N, DAG, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); case ISD::MUL: return PerformMulCombine(N, DAG, DCI); case ISD::SHL: case ISD::SRA: case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget); case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget); case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget); case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case ISD::FNEG: return PerformFNEGCombine(N, DAG, Subtarget); case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget); case X86ISD::FXOR: case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: return performFMinNumFMaxNumCombine(N, DAG, Subtarget); case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget); case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGNR: case X86ISD::BLENDI: case X86ISD::UNPCKH: case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); case ISD::MGATHER: case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG); } return SDValue(); } /// isTypeDesirableForOp - Return true if the target has native support for /// the specified value type and it is 'desirable' to use the type for the /// given node type. e.g. On x86 i16 is legal, but undesirable since i16 /// instruction encodings are longer and some i16 instructions are slow. bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { if (!isTypeLegal(VT)) return false; if (VT != MVT::i16) return true; switch (Opc) { default: return true; case ISD::LOAD: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::SHL: case ISD::SRL: case ISD::SUB: case ISD::ADD: case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: return false; } } /// This function checks if any of the users of EFLAGS copies the EFLAGS. We /// know that the code that lowers COPY of EFLAGS has to use the stack, and if /// we don't adjust the stack we clobber the first frame index. /// See X86InstrInfo::copyPhysReg. bool X86TargetLowering::hasCopyImplyingStackAdjustment( MachineFunction *MF) const { const MachineRegisterInfo &MRI = MF->getRegInfo(); return any_of(MRI.reg_instructions(X86::EFLAGS), [](const MachineInstr &RI) { return RI.isCopy(); }); } /// IsDesirableToPromoteOp - This method query the target whether it is /// beneficial for dag combiner to promote the specified node. If true, it /// should return the desired promotion type by reference. bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { EVT VT = Op.getValueType(); if (VT != MVT::i16) return false; bool Promote = false; bool Commute = false; switch (Op.getOpcode()) { default: break; case ISD::LOAD: { LoadSDNode *LD = cast(Op); // If the non-extending load has a single use and it's not live out, then it // might be folded. if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& Op.hasOneUse()*/) { for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) { // The only case where we'd want to promote LOAD (rather then it being // promoted as an operand is when it's only use is liveout. if (UI->getOpcode() != ISD::CopyToReg) return false; } } Promote = true; break; } case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: Promote = true; break; case ISD::SHL: case ISD::SRL: { SDValue N0 = Op.getOperand(0); // Look out for (store (shl (load), x)). if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) return false; Promote = true; break; } case ISD::ADD: case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: Commute = true; // fallthrough case ISD::SUB: { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); if (!Commute && MayFoldLoad(N1)) return false; // Avoid disabling potential load folding opportunities. if (MayFoldLoad(N0) && (!isa(N1) || MayFoldIntoStore(Op))) return false; if (MayFoldLoad(N1) && (!isa(N0) || MayFoldIntoStore(Op))) return false; Promote = true; } } PVT = MVT::i32; return Promote; } //===----------------------------------------------------------------------===// // X86 Inline Assembly Support //===----------------------------------------------------------------------===// // Helper to match a string separated by whitespace. static bool matchAsm(StringRef S, ArrayRef Pieces) { S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace. for (StringRef Piece : Pieces) { if (!S.startswith(Piece)) // Check if the piece matches. return false; S = S.substr(Piece.size()); StringRef::size_type Pos = S.find_first_not_of(" \t"); if (Pos == 0) // We matched a prefix. return false; S = S.substr(Pos); } return S.empty(); } static bool clobbersFlagRegisters(const SmallVector &AsmPieces) { if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") && std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") && std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) { if (AsmPieces.size() == 3) return true; else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}")) return true; } } return false; } bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { InlineAsm *IA = cast(CI->getCalledValue()); std::string AsmStr = IA->getAsmString(); IntegerType *Ty = dyn_cast(CI->getType()); if (!Ty || Ty->getBitWidth() % 16 != 0) return false; // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" SmallVector AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); switch (AsmPieces.size()) { default: return false; case 1: // FIXME: this should verify that we are targeting a 486 or better. If not, // we will turn this bswap into something that will be lowered to logical // ops instead of emitting the bswap asm. For now, we don't support 486 or // lower so don't worry about this. // bswap $0 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || matchAsm(AsmPieces[0], {"bswapl", "$0"}) || matchAsm(AsmPieces[0], {"bswapq", "$0"}) || matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { // No need to check constraints, nothing other than the equivalent of // "=r,0" would be valid here. return IntrinsicLowering::LowerToByteSwap(CI); } // rorw $$8, ${0:w} --> llvm.bswap.i16 if (CI->getType()->isIntegerTy(16) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { AsmPieces.clear(); StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) return IntrinsicLowering::LowerToByteSwap(CI); } break; case 3: if (CI->getType()->isIntegerTy(32) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { AsmPieces.clear(); StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) return IntrinsicLowering::LowerToByteSwap(CI); } if (CI->getType()->isIntegerTy(64)) { InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); if (Constraints.size() >= 2 && Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && matchAsm(AsmPieces[1], {"bswap", "%edx"}) && matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) return IntrinsicLowering::LowerToByteSwap(CI); } } break; } return false; } /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. X86TargetLowering::ConstraintType X86TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'R': case 'q': case 'Q': case 'f': case 't': case 'u': case 'y': case 'x': case 'Y': case 'l': return C_RegisterClass; case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A': return C_Register; case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'G': case 'C': case 'e': case 'Z': return C_Other; default: break; } } return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight X86TargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); case 'R': case 'q': case 'Q': case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A': if (CallOperandVal->getType()->isIntegerTy()) weight = CW_SpecificReg; break; case 'f': case 't': case 'u': if (type->isFloatingPointTy()) weight = CW_SpecificReg; break; case 'y': if (type->isX86_MMXTy() && Subtarget->hasMMX()) weight = CW_SpecificReg; break; case 'x': case 'Y': if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256())) weight = CW_Register; break; case 'I': if (ConstantInt *C = dyn_cast(info.CallOperandVal)) { if (C->getZExtValue() <= 31) weight = CW_Constant; } break; case 'J': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 63) weight = CW_Constant; } break; case 'K': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) weight = CW_Constant; } break; case 'L': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) weight = CW_Constant; } break; case 'M': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 3) weight = CW_Constant; } break; case 'N': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 0xff) weight = CW_Constant; } break; case 'G': case 'C': if (isa(CallOperandVal)) { weight = CW_Constant; } break; case 'e': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getSExtValue() >= -0x80000000LL) && (C->getSExtValue() <= 0x7fffffffLL)) weight = CW_Constant; } break; case 'Z': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 0xffffffff) weight = CW_Constant; } break; } return weight; } /// LowerXConstraint - try to replace an X constraint, which matches anything, /// with another that has more specific requirements based on the type of the /// corresponding operand. const char *X86TargetLowering:: LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { if (Subtarget->hasSSE2()) return "Y"; if (Subtarget->hasSSE1()) return "x"; } return TargetLowering::LowerXConstraint(ConstraintVT); } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector&Ops, SelectionDAG &DAG) const { SDValue Result; // Only support length 1 constraints for now. if (Constraint.length() > 1) return; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { default: break; case 'I': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 31) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'J': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 63) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'K': if (ConstantSDNode *C = dyn_cast(Op)) { if (isInt<8>(C->getSExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'L': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) { Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'M': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 3) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'N': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 255) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'O': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 127) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'e': { // 32-bit signed value if (ConstantSDNode *C = dyn_cast(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getSExtValue())) { // Widen to 64 bits here to get it sign extended. Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); break; } // FIXME gcc accepts some relocatable values here too, but only in certain // memory models; it's complicated. } return; } case 'Z': { // 32-bit unsigned value if (ConstantSDNode *C = dyn_cast(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getZExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } // FIXME gcc accepts some relocatable values here too, but only in certain // memory models; it's complicated. return; } case 'i': { // Literal immediates are always ok. if (ConstantSDNode *CST = dyn_cast(Op)) { // Widen to 64 bits here to get it sign extended. Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64); break; } // In any sort of PIC mode addresses need to be computed at runtime by // adding in a register or some sort of table lookup. These can't // be used as immediates. if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) return; // If we are in non-pic codegen mode, we allow the address of a global (with // an optional displacement) to be used with 'i'. GlobalAddressSDNode *GA = nullptr; int64_t Offset = 0; // Match either (GA), (GA+C), (GA+C1+C2), etc. while (1) { if ((GA = dyn_cast(Op))) { Offset += GA->getOffset(); break; } else if (Op.getOpcode() == ISD::ADD) { if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { Offset += C->getZExtValue(); Op = Op.getOperand(0); continue; } } else if (Op.getOpcode() == ISD::SUB) { if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { Offset += -C->getZExtValue(); Op = Op.getOperand(0); continue; } } // Otherwise, this isn't something we can handle, reject it. return; } const GlobalValue *GV = GA->getGlobal(); // If we require an extra load to get this address, as in PIC mode, we // can't accept it. if (isGlobalStubReference( Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()))) return; Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), GA->getValueType(0), Offset); break; } } if (Result.getNode()) { Ops.push_back(Result); return; } return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } std::pair X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. if (Constraint.size() == 1) { // GCC Constraint Letters switch (Constraint[0]) { default: break; // TODO: Slight differences here in allocation order and leaving // RIP in the class. Do they matter any more here than they do // in the normal allocation? case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. if (Subtarget->is64Bit()) { if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i64 || VT == MVT::f64) return std::make_pair(0U, &X86::GR64RegClass); break; } // 32-bit fallthrough case 'Q': // Q_REGS if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32_ABCDRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_ABCDRegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); if (VT == MVT::i64) return std::make_pair(0U, &X86::GR64_ABCDRegClass); break; case 'r': // GENERAL_REGS case 'l': // INDEX_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) return std::make_pair(0U, &X86::GR32RegClass); return std::make_pair(0U, &X86::GR64RegClass); case 'R': // LEGACY_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_NOREXRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_NOREXRegClass); if (VT == MVT::i32 || !Subtarget->is64Bit()) return std::make_pair(0U, &X86::GR32_NOREXRegClass); return std::make_pair(0U, &X86::GR64_NOREXRegClass); case 'f': // FP Stack registers. // If SSE is enabled for this VT, use f80 to ensure the isel moves the // value to the correct fpstack register class. if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP32RegClass); if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP64RegClass); return std::make_pair(0U, &X86::RFP80RegClass); case 'y': // MMX_REGS if MMX allowed. if (!Subtarget->hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'Y': // SSE_REGS if SSE2 allowed if (!Subtarget->hasSSE2()) break; // FALL THROUGH. case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed if (!Subtarget->hasSSE1()) break; switch (VT.SimpleTy) { default: break; // Scalar SSE types. case MVT::f32: case MVT::i32: return std::make_pair(0U, &X86::FR32RegClass); case MVT::f64: case MVT::i64: return std::make_pair(0U, &X86::FR64RegClass); // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. // Vector types. case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: return std::make_pair(0U, &X86::VR128RegClass); // AVX types. case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: return std::make_pair(0U, &X86::VR256RegClass); case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: case MVT::v8i64: return std::make_pair(0U, &X86::VR512RegClass); } break; } } // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? if (!Res.second) { // Map st(0) -> st(7) -> ST0 if (Constraint.size() == 7 && Constraint[0] == '{' && tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' && Constraint[3] == '(' && (Constraint[4] >= '0' && Constraint[4] <= '7') && Constraint[5] == ')' && Constraint[6] == '}') { Res.first = X86::FP0+Constraint[4]-'0'; Res.second = &X86::RFP80RegClass; return Res; } // GCC allows "st(0)" to be called just plain "st". if (StringRef("{st}").equals_lower(Constraint)) { Res.first = X86::FP0; Res.second = &X86::RFP80RegClass; return Res; } // flags -> EFLAGS if (StringRef("{flags}").equals_lower(Constraint)) { Res.first = X86::EFLAGS; Res.second = &X86::CCRRegClass; return Res; } // 'A' means EAX + EDX. if (Constraint == "A") { Res.first = X86::EAX; Res.second = &X86::GR32_ADRegClass; return Res; } return Res; } // Otherwise, check to see if this is a register class of the wrong value // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to // turn into {ax},{dx}. // MVT::Other is used to specify clobber names. if (Res.second->hasType(VT) || VT == MVT::Other) return Res; // Correct type already, nothing to do. // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should // return "eax". This should even work for things like getting 64bit integer // registers when given an f64 type. const TargetRegisterClass *Class = Res.second; if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass || Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) { unsigned Size = VT.getSizeInBits(); if (Size == 1) Size = 8; unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); if (DestReg > 0) { Res.first = DestReg; Res.second = Size == 8 ? &X86::GR8RegClass : Size == 16 ? &X86::GR16RegClass : Size == 32 ? &X86::GR32RegClass : &X86::GR64RegClass; assert(Res.second->contains(Res.first) && "Register in register class"); } else { // No register found/type mismatch. Res.first = 0; Res.second = nullptr; } } else if (Class == &X86::FR32RegClass || Class == &X86::FR64RegClass || Class == &X86::VR128RegClass || Class == &X86::VR256RegClass || Class == &X86::FR32XRegClass || Class == &X86::FR64XRegClass || Class == &X86::VR128XRegClass || Class == &X86::VR256XRegClass || Class == &X86::VR512RegClass) { // Handle references to XMM physical registers that got mapped into the // wrong class. This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can // find, ignoring the required type. // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. if (VT == MVT::f32 || VT == MVT::i32) Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) Res.second = &X86::FR64RegClass; else if (X86::VR128RegClass.hasType(VT)) Res.second = &X86::VR128RegClass; else if (X86::VR256RegClass.hasType(VT)) Res.second = &X86::VR256RegClass; else if (X86::VR512RegClass.hasType(VT)) Res.second = &X86::VR512RegClass; else { // Type mismatch and not a clobber: Return an error; Res.first = 0; Res.second = nullptr; } } return Res; } int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { // Scaling factors are not free at all. // An indexed folded instruction, i.e., inst (reg1, reg2, scale), // will take 2 allocations in the out of order engine instead of 1 // for plain addressing mode, i.e. inst (reg1). // E.g., // vaddps (%rsi,%drx), %ymm0, %ymm1 // Requires two allocations (one for the load, one for the computation) // whereas: // vaddps (%rsi), %ymm0, %ymm1 // Requires just 1 allocation, i.e., freeing allocations for other operations // and having less micro operations to execute. // // For some X86 architectures, this is even worse because for instance for // stores, the complex addressing mode forces the instruction to use the // "load" ports instead of the dedicated "store" port. // E.g., on Haswell: // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. if (isLegalAddressingMode(DL, AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 // as soon as we use a second register. return AM.Scale != 0; return -1; } bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { // Integer division on x86 is expensive. However, when aggressively optimizing // for code size, we prefer to use a div instruction, as it is usually smaller // than the alternative sequence. // The exception to this is vector division. Since x86 doesn't have vector // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); return OptSize && !VT.isVector(); } void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { if (!Subtarget->is64Bit()) return; // Update IsSplitCSR in X86MachineFunctionInfo. X86MachineFunctionInfo *AFI = Entry->getParent()->getInfo(); AFI->setIsSplitCSR(true); } void X86TargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (X86::GR64RegClass.contains(*I)) RC = &X86::GR64RegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); unsigned NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert(Entry->getParent()->getFunction()->hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); - BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - NewVR) + BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); + // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) - BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - *I) + BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } Index: projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp =================================================================== --- projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp (revision 294609) @@ -1,2930 +1,2759 @@ //===- InstCombineAndOrXor.cpp --------------------------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the visitAnd, visitOr, and visitXor functions. // //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Transforms/Utils/CmpInstAnalysis.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace PatternMatch; #define DEBUG_TYPE "instcombine" static inline Value *dyn_castNotVal(Value *V) { // If this is not(not(x)) don't return that this is a not: we want the two // not's to be folded first. if (BinaryOperator::isNot(V)) { Value *Operand = BinaryOperator::getNotArgument(V); if (!IsFreeToInvert(Operand, Operand->hasOneUse())) return Operand; } // Constants can be considered to be not'ed values... if (ConstantInt *C = dyn_cast(V)) return ConstantInt::get(C->getType(), ~C->getValue()); return nullptr; } /// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into /// a three bit mask. It also returns whether it is an ordered predicate by /// reference. static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) { isOrdered = false; switch (CC) { case FCmpInst::FCMP_ORD: isOrdered = true; return 0; // 000 case FCmpInst::FCMP_UNO: return 0; // 000 case FCmpInst::FCMP_OGT: isOrdered = true; return 1; // 001 case FCmpInst::FCMP_UGT: return 1; // 001 case FCmpInst::FCMP_OEQ: isOrdered = true; return 2; // 010 case FCmpInst::FCMP_UEQ: return 2; // 010 case FCmpInst::FCMP_OGE: isOrdered = true; return 3; // 011 case FCmpInst::FCMP_UGE: return 3; // 011 case FCmpInst::FCMP_OLT: isOrdered = true; return 4; // 100 case FCmpInst::FCMP_ULT: return 4; // 100 case FCmpInst::FCMP_ONE: isOrdered = true; return 5; // 101 case FCmpInst::FCMP_UNE: return 5; // 101 case FCmpInst::FCMP_OLE: isOrdered = true; return 6; // 110 case FCmpInst::FCMP_ULE: return 6; // 110 // True -> 7 default: // Not expecting FCMP_FALSE and FCMP_TRUE; llvm_unreachable("Unexpected FCmp predicate!"); } } /// This is the complement of getICmpCode, which turns an opcode and two /// operands into either a constant true or false, or a brand new ICmp /// instruction. The sign is passed in to determine which kind of predicate to /// use in the new icmp instruction. static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, InstCombiner::BuilderTy *Builder) { ICmpInst::Predicate NewPred; if (Value *NewConstant = getICmpValue(Sign, Code, LHS, RHS, NewPred)) return NewConstant; return Builder->CreateICmp(NewPred, LHS, RHS); } /// This is the complement of getFCmpCode, which turns an opcode and two /// operands into either a FCmp instruction. isordered is passed in to determine /// which kind of predicate to use in the new fcmp instruction. static Value *getFCmpValue(bool isordered, unsigned code, Value *LHS, Value *RHS, InstCombiner::BuilderTy *Builder) { CmpInst::Predicate Pred; switch (code) { default: llvm_unreachable("Illegal FCmp code!"); case 0: Pred = isordered ? FCmpInst::FCMP_ORD : FCmpInst::FCMP_UNO; break; case 1: Pred = isordered ? FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT; break; case 2: Pred = isordered ? FCmpInst::FCMP_OEQ : FCmpInst::FCMP_UEQ; break; case 3: Pred = isordered ? FCmpInst::FCMP_OGE : FCmpInst::FCMP_UGE; break; case 4: Pred = isordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT; break; case 5: Pred = isordered ? FCmpInst::FCMP_ONE : FCmpInst::FCMP_UNE; break; case 6: Pred = isordered ? FCmpInst::FCMP_OLE : FCmpInst::FCMP_ULE; break; case 7: if (!isordered) return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1); Pred = FCmpInst::FCMP_ORD; break; } return Builder->CreateFCmp(Pred, LHS, RHS); } /// \brief Transform BITWISE_OP(BSWAP(A),BSWAP(B)) to BSWAP(BITWISE_OP(A, B)) /// \param I Binary operator to transform. /// \return Pointer to node that must replace the original binary operator, or /// null pointer if no transformation was made. Value *InstCombiner::SimplifyBSwap(BinaryOperator &I) { IntegerType *ITy = dyn_cast(I.getType()); // Can't do vectors. if (I.getType()->isVectorTy()) return nullptr; // Can only do bitwise ops. unsigned Op = I.getOpcode(); if (Op != Instruction::And && Op != Instruction::Or && Op != Instruction::Xor) return nullptr; Value *OldLHS = I.getOperand(0); Value *OldRHS = I.getOperand(1); ConstantInt *ConstLHS = dyn_cast(OldLHS); ConstantInt *ConstRHS = dyn_cast(OldRHS); IntrinsicInst *IntrLHS = dyn_cast(OldLHS); IntrinsicInst *IntrRHS = dyn_cast(OldRHS); bool IsBswapLHS = (IntrLHS && IntrLHS->getIntrinsicID() == Intrinsic::bswap); bool IsBswapRHS = (IntrRHS && IntrRHS->getIntrinsicID() == Intrinsic::bswap); if (!IsBswapLHS && !IsBswapRHS) return nullptr; if (!IsBswapLHS && !ConstLHS) return nullptr; if (!IsBswapRHS && !ConstRHS) return nullptr; /// OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) ) /// OP( BSWAP(x), CONSTANT ) -> BSWAP( OP(x, BSWAP(CONSTANT) ) ) Value *NewLHS = IsBswapLHS ? IntrLHS->getOperand(0) : Builder->getInt(ConstLHS->getValue().byteSwap()); Value *NewRHS = IsBswapRHS ? IntrRHS->getOperand(0) : Builder->getInt(ConstRHS->getValue().byteSwap()); Value *BinOp = nullptr; if (Op == Instruction::And) BinOp = Builder->CreateAnd(NewLHS, NewRHS); else if (Op == Instruction::Or) BinOp = Builder->CreateOr(NewLHS, NewRHS); else //if (Op == Instruction::Xor) BinOp = Builder->CreateXor(NewLHS, NewRHS); Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap, ITy); return Builder->CreateCall(F, BinOp); } /// This handles expressions of the form ((val OP C1) & C2). Where /// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is /// guaranteed to be a binary operator. Instruction *InstCombiner::OptAndOp(Instruction *Op, ConstantInt *OpRHS, ConstantInt *AndRHS, BinaryOperator &TheAnd) { Value *X = Op->getOperand(0); Constant *Together = nullptr; if (!Op->isShift()) Together = ConstantExpr::getAnd(AndRHS, OpRHS); switch (Op->getOpcode()) { case Instruction::Xor: if (Op->hasOneUse()) { // (X ^ C1) & C2 --> (X & C2) ^ (C1&C2) Value *And = Builder->CreateAnd(X, AndRHS); And->takeName(Op); return BinaryOperator::CreateXor(And, Together); } break; case Instruction::Or: if (Op->hasOneUse()){ if (Together != OpRHS) { // (X | C1) & C2 --> (X | (C1&C2)) & C2 Value *Or = Builder->CreateOr(X, Together); Or->takeName(Op); return BinaryOperator::CreateAnd(Or, AndRHS); } ConstantInt *TogetherCI = dyn_cast(Together); if (TogetherCI && !TogetherCI->isZero()){ // (X | C1) & C2 --> (X & (C2^(C1&C2))) | C1 // NOTE: This reduces the number of bits set in the & mask, which // can expose opportunities for store narrowing. Together = ConstantExpr::getXor(AndRHS, Together); Value *And = Builder->CreateAnd(X, Together); And->takeName(Op); return BinaryOperator::CreateOr(And, OpRHS); } } break; case Instruction::Add: if (Op->hasOneUse()) { // Adding a one to a single bit bit-field should be turned into an XOR // of the bit. First thing to check is to see if this AND is with a // single bit constant. const APInt &AndRHSV = AndRHS->getValue(); // If there is only one bit set. if (AndRHSV.isPowerOf2()) { // Ok, at this point, we know that we are masking the result of the // ADD down to exactly one bit. If the constant we are adding has // no bits set below this bit, then we can eliminate the ADD. const APInt& AddRHS = OpRHS->getValue(); // Check to see if any bits below the one bit set in AndRHSV are set. if ((AddRHS & (AndRHSV-1)) == 0) { // If not, the only thing that can effect the output of the AND is // the bit specified by AndRHSV. If that bit is set, the effect of // the XOR is to toggle the bit. If it is clear, then the ADD has // no effect. if ((AddRHS & AndRHSV) == 0) { // Bit is not set, noop TheAnd.setOperand(0, X); return &TheAnd; } else { // Pull the XOR out of the AND. Value *NewAnd = Builder->CreateAnd(X, AndRHS); NewAnd->takeName(Op); return BinaryOperator::CreateXor(NewAnd, AndRHS); } } } } break; case Instruction::Shl: { // We know that the AND will not produce any of the bits shifted in, so if // the anded constant includes them, clear them now! // uint32_t BitWidth = AndRHS->getType()->getBitWidth(); uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); APInt ShlMask(APInt::getHighBitsSet(BitWidth, BitWidth-OpRHSVal)); ConstantInt *CI = Builder->getInt(AndRHS->getValue() & ShlMask); if (CI->getValue() == ShlMask) // Masking out bits that the shift already masks. return ReplaceInstUsesWith(TheAnd, Op); // No need for the and. if (CI != AndRHS) { // Reducing bits set in and. TheAnd.setOperand(1, CI); return &TheAnd; } break; } case Instruction::LShr: { // We know that the AND will not produce any of the bits shifted in, so if // the anded constant includes them, clear them now! This only applies to // unsigned shifts, because a signed shr may bring in set bits! // uint32_t BitWidth = AndRHS->getType()->getBitWidth(); uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal)); ConstantInt *CI = Builder->getInt(AndRHS->getValue() & ShrMask); if (CI->getValue() == ShrMask) // Masking out bits that the shift already masks. return ReplaceInstUsesWith(TheAnd, Op); if (CI != AndRHS) { TheAnd.setOperand(1, CI); // Reduce bits set in and cst. return &TheAnd; } break; } case Instruction::AShr: // Signed shr. // See if this is shifting in some sign extension, then masking it out // with an and. if (Op->hasOneUse()) { uint32_t BitWidth = AndRHS->getType()->getBitWidth(); uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal)); Constant *C = Builder->getInt(AndRHS->getValue() & ShrMask); if (C == AndRHS) { // Masking out bits shifted in. // (Val ashr C1) & C2 -> (Val lshr C1) & C2 // Make the argument unsigned. Value *ShVal = Op->getOperand(0); ShVal = Builder->CreateLShr(ShVal, OpRHS, Op->getName()); return BinaryOperator::CreateAnd(ShVal, AndRHS, TheAnd.getName()); } } break; } return nullptr; } /// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise /// (V < Lo || V >= Hi). In practice, we emit the more efficient /// (V-Lo) \(ConstantExpr::getICmp((isSigned ? ICmpInst::ICMP_SLE:ICmpInst::ICMP_ULE), Lo, Hi))->getZExtValue() && "Lo is not <= Hi in range emission code!"); if (Inside) { if (Lo == Hi) // Trivially false. return Builder->getFalse(); // V >= Min && V < Hi --> V < Hi if (cast(Lo)->isMinValue(isSigned)) { ICmpInst::Predicate pred = (isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT); return Builder->CreateICmp(pred, V, Hi); } // Emit V-Lo CreateAdd(V, NegLo, V->getName()+".off"); Constant *UpperBound = ConstantExpr::getAdd(NegLo, Hi); return Builder->CreateICmpULT(Add, UpperBound); } if (Lo == Hi) // Trivially true. return Builder->getTrue(); // V < Min || V >= Hi -> V > Hi-1 Hi = SubOne(cast(Hi)); if (cast(Lo)->isMinValue(isSigned)) { ICmpInst::Predicate pred = (isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT); return Builder->CreateICmp(pred, V, Hi); } // Emit V-Lo >u Hi-1-Lo // Note that Hi has already had one subtracted from it, above. ConstantInt *NegLo = cast(ConstantExpr::getNeg(Lo)); Value *Add = Builder->CreateAdd(V, NegLo, V->getName()+".off"); Constant *LowerBound = ConstantExpr::getAdd(NegLo, Hi); return Builder->CreateICmpUGT(Add, LowerBound); } /// Returns true iff Val consists of one contiguous run of 1s with any number /// of 0s on either side. The 1s are allowed to wrap from LSB to MSB, /// so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is /// not, since all 1s are not contiguous. static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) { const APInt& V = Val->getValue(); uint32_t BitWidth = Val->getType()->getBitWidth(); if (!APIntOps::isShiftedMask(BitWidth, V)) return false; // look for the first zero bit after the run of ones MB = BitWidth - ((V - 1) ^ V).countLeadingZeros(); // look for the first non-zero bit ME = V.getActiveBits(); return true; } /// This is part of an expression (LHS +/- RHS) & Mask, where isSub determines /// whether the operator is a sub. If we can fold one of the following xforms: /// /// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask /// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0 /// ((A ^ N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0 /// /// return (A +/- B). /// Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask, bool isSub, Instruction &I) { Instruction *LHSI = dyn_cast(LHS); if (!LHSI || LHSI->getNumOperands() != 2 || !isa(LHSI->getOperand(1))) return nullptr; ConstantInt *N = cast(LHSI->getOperand(1)); switch (LHSI->getOpcode()) { default: return nullptr; case Instruction::And: if (ConstantExpr::getAnd(N, Mask) == Mask) { // If the AndRHS is a power of two minus one (0+1+), this is simple. if ((Mask->getValue().countLeadingZeros() + Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth()) break; // Otherwise, if Mask is 0+1+0+, and if B is known to have the low 0+ // part, we don't need any explicit masks to take them out of A. If that // is all N is, ignore it. uint32_t MB = 0, ME = 0; if (isRunOfOnes(Mask, MB, ME)) { // begin/end bit of run, inclusive uint32_t BitWidth = cast(RHS->getType())->getBitWidth(); APInt Mask(APInt::getLowBitsSet(BitWidth, MB-1)); if (MaskedValueIsZero(RHS, Mask, 0, &I)) break; } } return nullptr; case Instruction::Or: case Instruction::Xor: // If the AndRHS is a power of two minus one (0+1+), and N&Mask == 0 if ((Mask->getValue().countLeadingZeros() + Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth() && ConstantExpr::getAnd(N, Mask)->isNullValue()) break; return nullptr; } if (isSub) return Builder->CreateSub(LHSI->getOperand(0), RHS, "fold"); return Builder->CreateAdd(LHSI->getOperand(0), RHS, "fold"); } /// enum for classifying (icmp eq (A & B), C) and (icmp ne (A & B), C) /// One of A and B is considered the mask, the other the value. This is /// described as the "AMask" or "BMask" part of the enum. If the enum /// contains only "Mask", then both A and B can be considered masks. /// If A is the mask, then it was proven, that (A & C) == C. This /// is trivial if C == A, or C == 0. If both A and C are constants, this /// proof is also easy. /// For the following explanations we assume that A is the mask. /// The part "AllOnes" declares, that the comparison is true only /// if (A & B) == A, or all bits of A are set in B. /// Example: (icmp eq (A & 3), 3) -> FoldMskICmp_AMask_AllOnes /// The part "AllZeroes" declares, that the comparison is true only /// if (A & B) == 0, or all bits of A are cleared in B. /// Example: (icmp eq (A & 3), 0) -> FoldMskICmp_Mask_AllZeroes /// The part "Mixed" declares, that (A & B) == C and C might or might not /// contain any number of one bits and zero bits. /// Example: (icmp eq (A & 3), 1) -> FoldMskICmp_AMask_Mixed /// The Part "Not" means, that in above descriptions "==" should be replaced /// by "!=". /// Example: (icmp ne (A & 3), 3) -> FoldMskICmp_AMask_NotAllOnes /// If the mask A contains a single bit, then the following is equivalent: /// (icmp eq (A & B), A) equals (icmp ne (A & B), 0) /// (icmp ne (A & B), A) equals (icmp eq (A & B), 0) enum MaskedICmpType { FoldMskICmp_AMask_AllOnes = 1, FoldMskICmp_AMask_NotAllOnes = 2, FoldMskICmp_BMask_AllOnes = 4, FoldMskICmp_BMask_NotAllOnes = 8, FoldMskICmp_Mask_AllZeroes = 16, FoldMskICmp_Mask_NotAllZeroes = 32, FoldMskICmp_AMask_Mixed = 64, FoldMskICmp_AMask_NotMixed = 128, FoldMskICmp_BMask_Mixed = 256, FoldMskICmp_BMask_NotMixed = 512 }; /// Return the set of pattern classes (from MaskedICmpType) /// that (icmp SCC (A & B), C) satisfies. static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, ICmpInst::Predicate SCC) { ConstantInt *ACst = dyn_cast(A); ConstantInt *BCst = dyn_cast(B); ConstantInt *CCst = dyn_cast(C); bool icmp_eq = (SCC == ICmpInst::ICMP_EQ); bool icmp_abit = (ACst && !ACst->isZero() && ACst->getValue().isPowerOf2()); bool icmp_bbit = (BCst && !BCst->isZero() && BCst->getValue().isPowerOf2()); unsigned result = 0; if (CCst && CCst->isZero()) { // if C is zero, then both A and B qualify as mask result |= (icmp_eq ? (FoldMskICmp_Mask_AllZeroes | FoldMskICmp_Mask_AllZeroes | FoldMskICmp_AMask_Mixed | FoldMskICmp_BMask_Mixed) : (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_AMask_NotMixed | FoldMskICmp_BMask_NotMixed)); if (icmp_abit) result |= (icmp_eq ? (FoldMskICmp_AMask_NotAllOnes | FoldMskICmp_AMask_NotMixed) : (FoldMskICmp_AMask_AllOnes | FoldMskICmp_AMask_Mixed)); if (icmp_bbit) result |= (icmp_eq ? (FoldMskICmp_BMask_NotAllOnes | FoldMskICmp_BMask_NotMixed) : (FoldMskICmp_BMask_AllOnes | FoldMskICmp_BMask_Mixed)); return result; } if (A == C) { result |= (icmp_eq ? (FoldMskICmp_AMask_AllOnes | FoldMskICmp_AMask_Mixed) : (FoldMskICmp_AMask_NotAllOnes | FoldMskICmp_AMask_NotMixed)); if (icmp_abit) result |= (icmp_eq ? (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_AMask_NotMixed) : (FoldMskICmp_Mask_AllZeroes | FoldMskICmp_AMask_Mixed)); } else if (ACst && CCst && ConstantExpr::getAnd(ACst, CCst) == CCst) { result |= (icmp_eq ? FoldMskICmp_AMask_Mixed : FoldMskICmp_AMask_NotMixed); } if (B == C) { result |= (icmp_eq ? (FoldMskICmp_BMask_AllOnes | FoldMskICmp_BMask_Mixed) : (FoldMskICmp_BMask_NotAllOnes | FoldMskICmp_BMask_NotMixed)); if (icmp_bbit) result |= (icmp_eq ? (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_BMask_NotMixed) : (FoldMskICmp_Mask_AllZeroes | FoldMskICmp_BMask_Mixed)); } else if (BCst && CCst && ConstantExpr::getAnd(BCst, CCst) == CCst) { result |= (icmp_eq ? FoldMskICmp_BMask_Mixed : FoldMskICmp_BMask_NotMixed); } return result; } /// Convert an analysis of a masked ICmp into its equivalent if all boolean /// operations had the opposite sense. Since each "NotXXX" flag (recording !=) /// is adjacent to the corresponding normal flag (recording ==), this just /// involves swapping those bits over. static unsigned conjugateICmpMask(unsigned Mask) { unsigned NewMask; NewMask = (Mask & (FoldMskICmp_AMask_AllOnes | FoldMskICmp_BMask_AllOnes | FoldMskICmp_Mask_AllZeroes | FoldMskICmp_AMask_Mixed | FoldMskICmp_BMask_Mixed)) << 1; NewMask |= (Mask & (FoldMskICmp_AMask_NotAllOnes | FoldMskICmp_BMask_NotAllOnes | FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_AMask_NotMixed | FoldMskICmp_BMask_NotMixed)) >> 1; return NewMask; } /// Decompose an icmp into the form ((X & Y) pred Z) if possible. /// The returned predicate is either == or !=. Returns false if /// decomposition fails. static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred, Value *&X, Value *&Y, Value *&Z) { ConstantInt *C = dyn_cast(I->getOperand(1)); if (!C) return false; switch (I->getPredicate()) { default: return false; case ICmpInst::ICMP_SLT: // X < 0 is equivalent to (X & SignBit) != 0. if (!C->isZero()) return false; Y = ConstantInt::get(I->getContext(), APInt::getSignBit(C->getBitWidth())); Pred = ICmpInst::ICMP_NE; break; case ICmpInst::ICMP_SGT: // X > -1 is equivalent to (X & SignBit) == 0. if (!C->isAllOnesValue()) return false; Y = ConstantInt::get(I->getContext(), APInt::getSignBit(C->getBitWidth())); Pred = ICmpInst::ICMP_EQ; break; case ICmpInst::ICMP_ULT: // X getValue().isPowerOf2()) return false; Y = ConstantInt::get(I->getContext(), -C->getValue()); Pred = ICmpInst::ICMP_EQ; break; case ICmpInst::ICMP_UGT: // X >u 2^n-1 is equivalent to (X & ~(2^n-1)) != 0. if (!(C->getValue() + 1).isPowerOf2()) return false; Y = ConstantInt::get(I->getContext(), ~C->getValue()); Pred = ICmpInst::ICMP_NE; break; } X = I->getOperand(0); Z = ConstantInt::getNullValue(C->getType()); return true; } /// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) /// Return the set of pattern classes (from MaskedICmpType) /// that both LHS and RHS satisfy. static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, Value*& B, Value*& C, Value*& D, Value*& E, ICmpInst *LHS, ICmpInst *RHS, ICmpInst::Predicate &LHSCC, ICmpInst::Predicate &RHSCC) { if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType()) return 0; // vectors are not (yet?) supported if (LHS->getOperand(0)->getType()->isVectorTy()) return 0; // Here comes the tricky part: // LHS might be of the form L11 & L12 == X, X == L21 & L22, // and L11 & L12 == L21 & L22. The same goes for RHS. // Now we must find those components L** and R**, that are equal, so // that we can extract the parameters A, B, C, D, and E for the canonical // above. Value *L1 = LHS->getOperand(0); Value *L2 = LHS->getOperand(1); Value *L11,*L12,*L21,*L22; // Check whether the icmp can be decomposed into a bit test. if (decomposeBitTestICmp(LHS, LHSCC, L11, L12, L2)) { L21 = L22 = L1 = nullptr; } else { // Look for ANDs in the LHS icmp. if (!L1->getType()->isIntegerTy()) { // You can icmp pointers, for example. They really aren't masks. L11 = L12 = nullptr; } else if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) { // Any icmp can be viewed as being trivially masked; if it allows us to // remove one, it's worth it. L11 = L1; L12 = Constant::getAllOnesValue(L1->getType()); } if (!L2->getType()->isIntegerTy()) { // You can icmp pointers, for example. They really aren't masks. L21 = L22 = nullptr; } else if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) { L21 = L2; L22 = Constant::getAllOnesValue(L2->getType()); } } // Bail if LHS was a icmp that can't be decomposed into an equality. if (!ICmpInst::isEquality(LHSCC)) return 0; Value *R1 = RHS->getOperand(0); Value *R2 = RHS->getOperand(1); Value *R11,*R12; bool ok = false; if (decomposeBitTestICmp(RHS, RHSCC, R11, R12, R2)) { if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) { A = R11; D = R12; } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) { A = R12; D = R11; } else { return 0; } E = R2; R1 = nullptr; ok = true; } else if (R1->getType()->isIntegerTy()) { if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) { // As before, model no mask as a trivial mask if it'll let us do an // optimization. R11 = R1; R12 = Constant::getAllOnesValue(R1->getType()); } if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) { A = R11; D = R12; E = R2; ok = true; } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) { A = R12; D = R11; E = R2; ok = true; } } // Bail if RHS was a icmp that can't be decomposed into an equality. if (!ICmpInst::isEquality(RHSCC)) return 0; // Look for ANDs in on the right side of the RHS icmp. if (!ok && R2->getType()->isIntegerTy()) { if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) { R11 = R2; R12 = Constant::getAllOnesValue(R2->getType()); } if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) { A = R11; D = R12; E = R1; ok = true; } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) { A = R12; D = R11; E = R1; ok = true; } else { return 0; } } if (!ok) return 0; if (L11 == A) { B = L12; C = L2; } else if (L12 == A) { B = L11; C = L2; } else if (L21 == A) { B = L22; C = L1; } else if (L22 == A) { B = L21; C = L1; } unsigned left_type = getTypeOfMaskedICmp(A, B, C, LHSCC); unsigned right_type = getTypeOfMaskedICmp(A, D, E, RHSCC); return left_type & right_type; } /// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) /// into a single (icmp(A & X) ==/!= Y). static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, llvm::InstCombiner::BuilderTy *Builder) { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); unsigned mask = foldLogOpOfMaskedICmpsHelper(A, B, C, D, E, LHS, RHS, LHSCC, RHSCC); if (mask == 0) return nullptr; assert(ICmpInst::isEquality(LHSCC) && ICmpInst::isEquality(RHSCC) && "foldLogOpOfMaskedICmpsHelper must return an equality predicate."); // In full generality: // (icmp (A & B) Op C) | (icmp (A & D) Op E) // == ![ (icmp (A & B) !Op C) & (icmp (A & D) !Op E) ] // // If the latter can be converted into (icmp (A & X) Op Y) then the former is // equivalent to (icmp (A & X) !Op Y). // // Therefore, we can pretend for the rest of this function that we're dealing // with the conjunction, provided we flip the sense of any comparisons (both // input and output). // In most cases we're going to produce an EQ for the "&&" case. ICmpInst::Predicate NEWCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE; if (!IsAnd) { // Convert the masking analysis into its equivalent with negated // comparisons. mask = conjugateICmpMask(mask); } if (mask & FoldMskICmp_Mask_AllZeroes) { // (icmp eq (A & B), 0) & (icmp eq (A & D), 0) // -> (icmp eq (A & (B|D)), 0) Value *newOr = Builder->CreateOr(B, D); Value *newAnd = Builder->CreateAnd(A, newOr); // we can't use C as zero, because we might actually handle // (icmp ne (A & B), B) & (icmp ne (A & D), D) // with B and D, having a single bit set Value *zero = Constant::getNullValue(A->getType()); return Builder->CreateICmp(NEWCC, newAnd, zero); } if (mask & FoldMskICmp_BMask_AllOnes) { // (icmp eq (A & B), B) & (icmp eq (A & D), D) // -> (icmp eq (A & (B|D)), (B|D)) Value *newOr = Builder->CreateOr(B, D); Value *newAnd = Builder->CreateAnd(A, newOr); return Builder->CreateICmp(NEWCC, newAnd, newOr); } if (mask & FoldMskICmp_AMask_AllOnes) { // (icmp eq (A & B), A) & (icmp eq (A & D), A) // -> (icmp eq (A & (B&D)), A) Value *newAnd1 = Builder->CreateAnd(B, D); Value *newAnd = Builder->CreateAnd(A, newAnd1); return Builder->CreateICmp(NEWCC, newAnd, A); } // Remaining cases assume at least that B and D are constant, and depend on // their actual values. This isn't strictly, necessary, just a "handle the // easy cases for now" decision. ConstantInt *BCst = dyn_cast(B); if (!BCst) return nullptr; ConstantInt *DCst = dyn_cast(D); if (!DCst) return nullptr; if (mask & (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_BMask_NotAllOnes)) { // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and // (icmp ne (A & B), B) & (icmp ne (A & D), D) // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0) // Only valid if one of the masks is a superset of the other (check "B&D" is // the same as either B or D). APInt NewMask = BCst->getValue() & DCst->getValue(); if (NewMask == BCst->getValue()) return LHS; else if (NewMask == DCst->getValue()) return RHS; } if (mask & FoldMskICmp_AMask_NotAllOnes) { // (icmp ne (A & B), B) & (icmp ne (A & D), D) // -> (icmp ne (A & B), A) or (icmp ne (A & D), A) // Only valid if one of the masks is a superset of the other (check "B|D" is // the same as either B or D). APInt NewMask = BCst->getValue() | DCst->getValue(); if (NewMask == BCst->getValue()) return LHS; else if (NewMask == DCst->getValue()) return RHS; } if (mask & FoldMskICmp_BMask_Mixed) { // (icmp eq (A & B), C) & (icmp eq (A & D), E) // We already know that B & C == C && D & E == E. // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of // C and E, which are shared by both the mask B and the mask D, don't // contradict, then we can transform to // -> (icmp eq (A & (B|D)), (C|E)) // Currently, we only handle the case of B, C, D, and E being constant. // we can't simply use C and E, because we might actually handle // (icmp ne (A & B), B) & (icmp eq (A & D), D) // with B and D, having a single bit set ConstantInt *CCst = dyn_cast(C); if (!CCst) return nullptr; ConstantInt *ECst = dyn_cast(E); if (!ECst) return nullptr; if (LHSCC != NEWCC) CCst = cast(ConstantExpr::getXor(BCst, CCst)); if (RHSCC != NEWCC) ECst = cast(ConstantExpr::getXor(DCst, ECst)); // if there is a conflict we should actually return a false for the // whole construct if (((BCst->getValue() & DCst->getValue()) & (CCst->getValue() ^ ECst->getValue())) != 0) return ConstantInt::get(LHS->getType(), !IsAnd); Value *newOr1 = Builder->CreateOr(B, D); Value *newOr2 = ConstantExpr::getOr(CCst, ECst); Value *newAnd = Builder->CreateAnd(A, newOr1); return Builder->CreateICmp(NEWCC, newAnd, newOr2); } return nullptr; } /// Try to fold a signed range checked with lower bound 0 to an unsigned icmp. /// Example: (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n /// If \p Inverted is true then the check is for the inverted range, e.g. /// (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted) { // Check the lower range comparison, e.g. x >= 0 // InstCombine already ensured that if there is a constant it's on the RHS. ConstantInt *RangeStart = dyn_cast(Cmp0->getOperand(1)); if (!RangeStart) return nullptr; ICmpInst::Predicate Pred0 = (Inverted ? Cmp0->getInversePredicate() : Cmp0->getPredicate()); // Accept x > -1 or x >= 0 (after potentially inverting the predicate). if (!((Pred0 == ICmpInst::ICMP_SGT && RangeStart->isMinusOne()) || (Pred0 == ICmpInst::ICMP_SGE && RangeStart->isZero()))) return nullptr; ICmpInst::Predicate Pred1 = (Inverted ? Cmp1->getInversePredicate() : Cmp1->getPredicate()); Value *Input = Cmp0->getOperand(0); Value *RangeEnd; if (Cmp1->getOperand(0) == Input) { // For the upper range compare we have: icmp x, n RangeEnd = Cmp1->getOperand(1); } else if (Cmp1->getOperand(1) == Input) { // For the upper range compare we have: icmp n, x RangeEnd = Cmp1->getOperand(0); Pred1 = ICmpInst::getSwappedPredicate(Pred1); } else { return nullptr; } // Check the upper range comparison, e.g. x < n ICmpInst::Predicate NewPred; switch (Pred1) { case ICmpInst::ICMP_SLT: NewPred = ICmpInst::ICMP_ULT; break; case ICmpInst::ICMP_SLE: NewPred = ICmpInst::ICMP_ULE; break; default: return nullptr; } // This simplification is only valid if the upper range is not negative. bool IsNegative, IsNotNegative; ComputeSignBit(RangeEnd, IsNotNegative, IsNegative, /*Depth=*/0, Cmp1); if (!IsNotNegative) return nullptr; if (Inverted) NewPred = ICmpInst::getInversePredicate(NewPred); return Builder->CreateICmp(NewPred, Input, RangeEnd); } /// Fold (icmp)&(icmp) if possible. Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B) if (PredicatesFoldable(LHSCC, RHSCC)) { if (LHS->getOperand(0) == RHS->getOperand(1) && LHS->getOperand(1) == RHS->getOperand(0)) LHS->swapOperands(); if (LHS->getOperand(0) == RHS->getOperand(0) && LHS->getOperand(1) == RHS->getOperand(1)) { Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); unsigned Code = getICmpCode(LHS) & getICmpCode(RHS); bool isSigned = LHS->isSigned() || RHS->isSigned(); return getNewICmpValue(isSigned, Code, Op0, Op1, Builder); } } // handle (roughly): (icmp eq (A & B), C) & (icmp eq (A & D), E) if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder)) return V; // E.g. (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/false)) return V; // E.g. (icmp slt x, n) & (icmp sge x, 0) --> icmp ult x, n if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/false)) return V; // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2). Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0); ConstantInt *LHSCst = dyn_cast(LHS->getOperand(1)); ConstantInt *RHSCst = dyn_cast(RHS->getOperand(1)); if (!LHSCst || !RHSCst) return nullptr; if (LHSCst == RHSCst && LHSCC == RHSCC) { // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C) // where C is a power of 2 if (LHSCC == ICmpInst::ICMP_ULT && LHSCst->getValue().isPowerOf2()) { Value *NewOr = Builder->CreateOr(Val, Val2); return Builder->CreateICmp(LHSCC, NewOr, LHSCst); } // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0) if (LHSCC == ICmpInst::ICMP_EQ && LHSCst->isZero()) { Value *NewOr = Builder->CreateOr(Val, Val2); return Builder->CreateICmp(LHSCC, NewOr, LHSCst); } } // (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2 // where CMAX is the all ones value for the truncated type, // iff the lower bits of C2 and CA are zero. if (LHSCC == ICmpInst::ICMP_EQ && LHSCC == RHSCC && LHS->hasOneUse() && RHS->hasOneUse()) { Value *V; ConstantInt *AndCst, *SmallCst = nullptr, *BigCst = nullptr; // (trunc x) == C1 & (and x, CA) == C2 // (and x, CA) == C2 & (trunc x) == C1 if (match(Val2, m_Trunc(m_Value(V))) && match(Val, m_And(m_Specific(V), m_ConstantInt(AndCst)))) { SmallCst = RHSCst; BigCst = LHSCst; } else if (match(Val, m_Trunc(m_Value(V))) && match(Val2, m_And(m_Specific(V), m_ConstantInt(AndCst)))) { SmallCst = LHSCst; BigCst = RHSCst; } if (SmallCst && BigCst) { unsigned BigBitSize = BigCst->getType()->getBitWidth(); unsigned SmallBitSize = SmallCst->getType()->getBitWidth(); // Check that the low bits are zero. APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize); if ((Low & AndCst->getValue()) == 0 && (Low & BigCst->getValue()) == 0) { Value *NewAnd = Builder->CreateAnd(V, Low | AndCst->getValue()); APInt N = SmallCst->getValue().zext(BigBitSize) | BigCst->getValue(); Value *NewVal = ConstantInt::get(AndCst->getType()->getContext(), N); return Builder->CreateICmp(LHSCC, NewAnd, NewVal); } } } // From here on, we only handle: // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler. if (Val != Val2) return nullptr; // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere. if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE || RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE || LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE || RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE) return nullptr; // Make a constant range that's the intersection of the two icmp ranges. // If the intersection is empty, we know that the result is false. ConstantRange LHSRange = ConstantRange::makeAllowedICmpRegion(LHSCC, LHSCst->getValue()); ConstantRange RHSRange = ConstantRange::makeAllowedICmpRegion(RHSCC, RHSCst->getValue()); if (LHSRange.intersectWith(RHSRange).isEmptySet()) return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); // We can't fold (ugt x, C) & (sgt x, C2). if (!PredicatesFoldable(LHSCC, RHSCC)) return nullptr; // Ensure that the larger constant is on the RHS. bool ShouldSwap; if (CmpInst::isSigned(LHSCC) || (ICmpInst::isEquality(LHSCC) && CmpInst::isSigned(RHSCC))) ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue()); else ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue()); if (ShouldSwap) { std::swap(LHS, RHS); std::swap(LHSCst, RHSCst); std::swap(LHSCC, RHSCC); } // At this point, we know we have two icmp instructions // comparing a value against two constants and and'ing the result // together. Because of the above check, we know that we only have // icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. We also know // (from the icmp folding check above), that the two constants // are not equal and that the larger constant is on the RHS assert(LHSCst != RHSCst && "Compares not folded above?"); switch (LHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: switch (RHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_NE: // (X == 13 & X != 15) -> X == 13 case ICmpInst::ICMP_ULT: // (X == 13 & X < 15) -> X == 13 case ICmpInst::ICMP_SLT: // (X == 13 & X < 15) -> X == 13 return LHS; } case ICmpInst::ICMP_NE: switch (RHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_ULT: if (LHSCst == SubOne(RHSCst)) // (X != 13 & X u< 14) -> X < 13 return Builder->CreateICmpULT(Val, LHSCst); if (LHSCst->isNullValue()) // (X != 0 & X u< 14) -> X-1 u< 13 return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, false, true); break; // (X != 13 & X u< 15) -> no change case ICmpInst::ICMP_SLT: if (LHSCst == SubOne(RHSCst)) // (X != 13 & X s< 14) -> X < 13 return Builder->CreateICmpSLT(Val, LHSCst); break; // (X != 13 & X s< 15) -> no change case ICmpInst::ICMP_EQ: // (X != 13 & X == 15) -> X == 15 case ICmpInst::ICMP_UGT: // (X != 13 & X u> 15) -> X u> 15 case ICmpInst::ICMP_SGT: // (X != 13 & X s> 15) -> X s> 15 return RHS; case ICmpInst::ICMP_NE: // Special case to get the ordering right when the values wrap around // zero. if (LHSCst->getValue() == 0 && RHSCst->getValue().isAllOnesValue()) std::swap(LHSCst, RHSCst); if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1 Constant *AddCST = ConstantExpr::getNeg(LHSCst); Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off"); return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1), Val->getName()+".cmp"); } break; // (X != 13 & X != 15) -> no change } break; case ICmpInst::ICMP_ULT: switch (RHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: // (X u< 13 & X == 15) -> false case ICmpInst::ICMP_UGT: // (X u< 13 & X u> 15) -> false return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); case ICmpInst::ICMP_SGT: // (X u< 13 & X s> 15) -> no change break; case ICmpInst::ICMP_NE: // (X u< 13 & X != 15) -> X u< 13 case ICmpInst::ICMP_ULT: // (X u< 13 & X u< 15) -> X u< 13 return LHS; case ICmpInst::ICMP_SLT: // (X u< 13 & X s< 15) -> no change break; } break; case ICmpInst::ICMP_SLT: switch (RHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_UGT: // (X s< 13 & X u> 15) -> no change break; case ICmpInst::ICMP_NE: // (X s< 13 & X != 15) -> X < 13 case ICmpInst::ICMP_SLT: // (X s< 13 & X s< 15) -> X < 13 return LHS; case ICmpInst::ICMP_ULT: // (X s< 13 & X u< 15) -> no change break; } break; case ICmpInst::ICMP_UGT: switch (RHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: // (X u> 13 & X == 15) -> X == 15 case ICmpInst::ICMP_UGT: // (X u> 13 & X u> 15) -> X u> 15 return RHS; case ICmpInst::ICMP_SGT: // (X u> 13 & X s> 15) -> no change break; case ICmpInst::ICMP_NE: if (RHSCst == AddOne(LHSCst)) // (X u> 13 & X != 14) -> X u> 14 return Builder->CreateICmp(LHSCC, Val, RHSCst); break; // (X u> 13 & X != 15) -> no change case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) 13 & X s< 15) -> no change break; } break; case ICmpInst::ICMP_SGT: switch (RHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: // (X s> 13 & X == 15) -> X == 15 case ICmpInst::ICMP_SGT: // (X s> 13 & X s> 15) -> X s> 15 return RHS; case ICmpInst::ICMP_UGT: // (X s> 13 & X u> 15) -> no change break; case ICmpInst::ICMP_NE: if (RHSCst == AddOne(LHSCst)) // (X s> 13 & X != 14) -> X s> 14 return Builder->CreateICmp(LHSCC, Val, RHSCst); break; // (X s> 13 & X != 15) -> no change case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) s< 1 return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, true, true); case ICmpInst::ICMP_ULT: // (X s> 13 & X u< 15) -> no change break; } break; } return nullptr; } /// Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of instcombine, this returns /// a Value which should already be inserted into the function. Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (LHS->getPredicate() == FCmpInst::FCMP_ORD && RHS->getPredicate() == FCmpInst::FCMP_ORD) { if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType()) return nullptr; // (fcmp ord x, c) & (fcmp ord y, c) -> (fcmp ord x, y) if (ConstantFP *LHSC = dyn_cast(LHS->getOperand(1))) if (ConstantFP *RHSC = dyn_cast(RHS->getOperand(1))) { // If either of the constants are nans, then the whole thing returns // false. if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN()) return Builder->getFalse(); return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0)); } // Handle vector zeros. This occurs because the canonical form of // "fcmp ord x,x" is "fcmp ord x, 0". if (isa(LHS->getOperand(1)) && isa(RHS->getOperand(1))) return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0)); return nullptr; } Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1); Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1); FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate(); if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) { // Swap RHS operands to match LHS. Op1CC = FCmpInst::getSwappedPredicate(Op1CC); std::swap(Op1LHS, Op1RHS); } if (Op0LHS == Op1LHS && Op0RHS == Op1RHS) { // Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y). if (Op0CC == Op1CC) return Builder->CreateFCmp((FCmpInst::Predicate)Op0CC, Op0LHS, Op0RHS); if (Op0CC == FCmpInst::FCMP_FALSE || Op1CC == FCmpInst::FCMP_FALSE) return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); if (Op0CC == FCmpInst::FCMP_TRUE) return RHS; if (Op1CC == FCmpInst::FCMP_TRUE) return LHS; bool Op0Ordered; bool Op1Ordered; unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered); unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered); // uno && ord -> false if (Op0Pred == 0 && Op1Pred == 0 && Op0Ordered != Op1Ordered) return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); if (Op1Pred == 0) { std::swap(LHS, RHS); std::swap(Op0Pred, Op1Pred); std::swap(Op0Ordered, Op1Ordered); } if (Op0Pred == 0) { // uno && ueq -> uno && (uno || eq) -> uno // ord && olt -> ord && (ord && lt) -> olt if (!Op0Ordered && (Op0Ordered == Op1Ordered)) return LHS; if (Op0Ordered && (Op0Ordered == Op1Ordered)) return RHS; // uno && oeq -> uno && (ord && eq) -> false if (!Op0Ordered) return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); // ord && ueq -> ord && (uno || eq) -> oeq return getFCmpValue(true, Op1Pred, Op0LHS, Op0RHS, Builder); } } return nullptr; } /// Match De Morgan's Laws: /// (~A & ~B) == (~(A | B)) /// (~A | ~B) == (~(A & B)) static Instruction *matchDeMorgansLaws(BinaryOperator &I, InstCombiner::BuilderTy *Builder) { auto Opcode = I.getOpcode(); assert((Opcode == Instruction::And || Opcode == Instruction::Or) && "Trying to match De Morgan's Laws with something other than and/or"); // Flip the logic operation. if (Opcode == Instruction::And) Opcode = Instruction::Or; else Opcode = Instruction::And; Value *Op0 = I.getOperand(0); Value *Op1 = I.getOperand(1); // TODO: Use pattern matchers instead of dyn_cast. if (Value *Op0NotVal = dyn_castNotVal(Op0)) if (Value *Op1NotVal = dyn_castNotVal(Op1)) if (Op0->hasOneUse() && Op1->hasOneUse()) { Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal, I.getName() + ".demorgan"); return BinaryOperator::CreateNot(LogicOp); } // De Morgan's Law in disguise: // (zext(bool A) ^ 1) & (zext(bool B) ^ 1) -> zext(~(A | B)) // (zext(bool A) ^ 1) | (zext(bool B) ^ 1) -> zext(~(A & B)) Value *A = nullptr; Value *B = nullptr; ConstantInt *C1 = nullptr; if (match(Op0, m_OneUse(m_Xor(m_ZExt(m_Value(A)), m_ConstantInt(C1)))) && match(Op1, m_OneUse(m_Xor(m_ZExt(m_Value(B)), m_Specific(C1))))) { // TODO: This check could be loosened to handle different type sizes. // Alternatively, we could fix the definition of m_Not to recognize a not // operation hidden by a zext? if (A->getType()->isIntegerTy(1) && B->getType()->isIntegerTy(1) && C1->isOne()) { Value *LogicOp = Builder->CreateBinOp(Opcode, A, B, I.getName() + ".demorgan"); Value *Not = Builder->CreateNot(LogicOp); return CastInst::CreateZExtOrBitCast(Not, I.getType()); } } return nullptr; } Instruction *InstCombiner::visitAnd(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); if (Value *V = SimplifyAndInst(Op0, Op1, DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); // (A|B)&(A|C) -> A|(B&C) etc if (Value *V = SimplifyUsingDistributiveLaws(I)) return ReplaceInstUsesWith(I, V); // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. if (SimplifyDemandedInstructionBits(I)) return &I; if (Value *V = SimplifyBSwap(I)) return ReplaceInstUsesWith(I, V); if (ConstantInt *AndRHS = dyn_cast(Op1)) { const APInt &AndRHSMask = AndRHS->getValue(); // Optimize a variety of ((val OP C1) & C2) combinations... if (BinaryOperator *Op0I = dyn_cast(Op0)) { Value *Op0LHS = Op0I->getOperand(0); Value *Op0RHS = Op0I->getOperand(1); switch (Op0I->getOpcode()) { default: break; case Instruction::Xor: case Instruction::Or: { // If the mask is only needed on one incoming arm, push it up. if (!Op0I->hasOneUse()) break; APInt NotAndRHS(~AndRHSMask); if (MaskedValueIsZero(Op0LHS, NotAndRHS, 0, &I)) { // Not masking anything out for the LHS, move to RHS. Value *NewRHS = Builder->CreateAnd(Op0RHS, AndRHS, Op0RHS->getName()+".masked"); return BinaryOperator::Create(Op0I->getOpcode(), Op0LHS, NewRHS); } if (!isa(Op0RHS) && MaskedValueIsZero(Op0RHS, NotAndRHS, 0, &I)) { // Not masking anything out for the RHS, move to LHS. Value *NewLHS = Builder->CreateAnd(Op0LHS, AndRHS, Op0LHS->getName()+".masked"); return BinaryOperator::Create(Op0I->getOpcode(), NewLHS, Op0RHS); } break; } case Instruction::Add: // ((A & N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == AndRHS. // ((A | N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0 // ((A ^ N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0 if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, false, I)) return BinaryOperator::CreateAnd(V, AndRHS); if (Value *V = FoldLogicalPlusAnd(Op0RHS, Op0LHS, AndRHS, false, I)) return BinaryOperator::CreateAnd(V, AndRHS); // Add commutes break; case Instruction::Sub: // ((A & N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == AndRHS. // ((A | N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0 // ((A ^ N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0 if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I)) return BinaryOperator::CreateAnd(V, AndRHS); // -x & 1 -> x & 1 if (AndRHSMask == 1 && match(Op0LHS, m_Zero())) return BinaryOperator::CreateAnd(Op0RHS, AndRHS); // (A - N) & AndRHS -> -N & AndRHS iff A&AndRHS==0 and AndRHS // has 1's for all bits that the subtraction with A might affect. if (Op0I->hasOneUse() && !match(Op0LHS, m_Zero())) { uint32_t BitWidth = AndRHSMask.getBitWidth(); uint32_t Zeros = AndRHSMask.countLeadingZeros(); APInt Mask = APInt::getLowBitsSet(BitWidth, BitWidth - Zeros); if (MaskedValueIsZero(Op0LHS, Mask, 0, &I)) { Value *NewNeg = Builder->CreateNeg(Op0RHS); return BinaryOperator::CreateAnd(NewNeg, AndRHS); } } break; case Instruction::Shl: case Instruction::LShr: // (1 << x) & 1 --> zext(x == 0) // (1 >> x) & 1 --> zext(x == 0) if (AndRHSMask == 1 && Op0LHS == AndRHS) { Value *NewICmp = Builder->CreateICmpEQ(Op0RHS, Constant::getNullValue(I.getType())); return new ZExtInst(NewICmp, I.getType()); } break; } if (ConstantInt *Op0CI = dyn_cast(Op0I->getOperand(1))) if (Instruction *Res = OptAndOp(Op0I, Op0CI, AndRHS, I)) return Res; } // If this is an integer truncation, and if the source is an 'and' with // immediate, transform it. This frequently occurs for bitfield accesses. { Value *X = nullptr; ConstantInt *YC = nullptr; if (match(Op0, m_Trunc(m_And(m_Value(X), m_ConstantInt(YC))))) { // Change: and (trunc (and X, YC) to T), C2 // into : and (trunc X to T), trunc(YC) & C2 // This will fold the two constants together, which may allow // other simplifications. Value *NewCast = Builder->CreateTrunc(X, I.getType(), "and.shrunk"); Constant *C3 = ConstantExpr::getTrunc(YC, I.getType()); C3 = ConstantExpr::getAnd(C3, AndRHS); return BinaryOperator::CreateAnd(NewCast, C3); } } // Try to fold constant and into select arguments. if (SelectInst *SI = dyn_cast(Op0)) if (Instruction *R = FoldOpIntoSelect(I, SI)) return R; if (isa(Op0)) if (Instruction *NV = FoldOpIntoPhi(I)) return NV; } if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) return DeMorgan; { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; // (A|B) & ~(A&B) -> A^B if (match(Op0, m_Or(m_Value(A), m_Value(B))) && match(Op1, m_Not(m_And(m_Value(C), m_Value(D)))) && ((A == C && B == D) || (A == D && B == C))) return BinaryOperator::CreateXor(A, B); // ~(A&B) & (A|B) -> A^B if (match(Op1, m_Or(m_Value(A), m_Value(B))) && match(Op0, m_Not(m_And(m_Value(C), m_Value(D)))) && ((A == C && B == D) || (A == D && B == C))) return BinaryOperator::CreateXor(A, B); // A&(A^B) => A & ~B { Value *tmpOp0 = Op0; Value *tmpOp1 = Op1; if (Op0->hasOneUse() && match(Op0, m_Xor(m_Value(A), m_Value(B)))) { if (A == Op1 || B == Op1 ) { tmpOp1 = Op0; tmpOp0 = Op1; // Simplify below } } if (tmpOp1->hasOneUse() && match(tmpOp1, m_Xor(m_Value(A), m_Value(B)))) { if (B == tmpOp0) { std::swap(A, B); } // Notice that the patten (A&(~B)) is actually (A&(-1^B)), so if // A is originally -1 (or a vector of -1 and undefs), then we enter // an endless loop. By checking that A is non-constant we ensure that // we will never get to the loop. if (A == tmpOp0 && !isa(A)) // A&(A^B) -> A & ~B return BinaryOperator::CreateAnd(A, Builder->CreateNot(B)); } } // (A&((~A)|B)) -> A&B if (match(Op0, m_Or(m_Not(m_Specific(Op1)), m_Value(A))) || match(Op0, m_Or(m_Value(A), m_Not(m_Specific(Op1))))) return BinaryOperator::CreateAnd(A, Op1); if (match(Op1, m_Or(m_Not(m_Specific(Op0)), m_Value(A))) || match(Op1, m_Or(m_Value(A), m_Not(m_Specific(Op0))))) return BinaryOperator::CreateAnd(A, Op0); // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) if (Op1->hasOneUse() || cast(Op1)->hasOneUse()) return BinaryOperator::CreateAnd(Op0, Builder->CreateNot(C)); // ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B)))) if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) if (Op0->hasOneUse() || cast(Op0)->hasOneUse()) return BinaryOperator::CreateAnd(Op1, Builder->CreateNot(C)); // (A | B) & ((~A) ^ B) -> (A & B) if (match(Op0, m_Or(m_Value(A), m_Value(B))) && match(Op1, m_Xor(m_Not(m_Specific(A)), m_Specific(B)))) return BinaryOperator::CreateAnd(A, B); // ((~A) ^ B) & (A | B) -> (A & B) if (match(Op0, m_Xor(m_Not(m_Value(A)), m_Value(B))) && match(Op1, m_Or(m_Specific(A), m_Specific(B)))) return BinaryOperator::CreateAnd(A, B); } { ICmpInst *LHS = dyn_cast(Op0); ICmpInst *RHS = dyn_cast(Op1); if (LHS && RHS) if (Value *Res = FoldAndOfICmps(LHS, RHS)) return ReplaceInstUsesWith(I, Res); // TODO: Make this recursive; it's a little tricky because an arbitrary // number of 'and' instructions might have to be created. Value *X, *Y; if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) { if (auto *Cmp = dyn_cast(X)) if (Value *Res = FoldAndOfICmps(LHS, Cmp)) return ReplaceInstUsesWith(I, Builder->CreateAnd(Res, Y)); if (auto *Cmp = dyn_cast(Y)) if (Value *Res = FoldAndOfICmps(LHS, Cmp)) return ReplaceInstUsesWith(I, Builder->CreateAnd(Res, X)); } if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) { if (auto *Cmp = dyn_cast(X)) if (Value *Res = FoldAndOfICmps(Cmp, RHS)) return ReplaceInstUsesWith(I, Builder->CreateAnd(Res, Y)); if (auto *Cmp = dyn_cast(Y)) if (Value *Res = FoldAndOfICmps(Cmp, RHS)) return ReplaceInstUsesWith(I, Builder->CreateAnd(Res, X)); } } // If and'ing two fcmp, try combine them into one. if (FCmpInst *LHS = dyn_cast(I.getOperand(0))) if (FCmpInst *RHS = dyn_cast(I.getOperand(1))) if (Value *Res = FoldAndOfFCmps(LHS, RHS)) return ReplaceInstUsesWith(I, Res); if (CastInst *Op0C = dyn_cast(Op0)) { Value *Op0COp = Op0C->getOperand(0); Type *SrcTy = Op0COp->getType(); // fold (and (cast A), (cast B)) -> (cast (and A, B)) if (CastInst *Op1C = dyn_cast(Op1)) { if (Op0C->getOpcode() == Op1C->getOpcode() && // same cast kind ? SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntOrIntVectorTy()) { Value *Op1COp = Op1C->getOperand(0); // Only do this if the casts both really cause code to be generated. if (ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) && ShouldOptimizeCast(Op1C->getOpcode(), Op1COp, I.getType())) { Value *NewOp = Builder->CreateAnd(Op0COp, Op1COp, I.getName()); return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType()); } // If this is and(cast(icmp), cast(icmp)), try to fold this even if the // cast is otherwise not optimizable. This happens for vector sexts. if (ICmpInst *RHS = dyn_cast(Op1COp)) if (ICmpInst *LHS = dyn_cast(Op0COp)) if (Value *Res = FoldAndOfICmps(LHS, RHS)) return CastInst::Create(Op0C->getOpcode(), Res, I.getType()); // If this is and(cast(fcmp), cast(fcmp)), try to fold this even if the // cast is otherwise not optimizable. This happens for vector sexts. if (FCmpInst *RHS = dyn_cast(Op1COp)) if (FCmpInst *LHS = dyn_cast(Op0COp)) if (Value *Res = FoldAndOfFCmps(LHS, RHS)) return CastInst::Create(Op0C->getOpcode(), Res, I.getType()); } } // If we are masking off the sign bit of a floating-point value, convert // this to the canonical fabs intrinsic call and cast back to integer. // The backend should know how to optimize fabs(). // TODO: This transform should also apply to vectors. ConstantInt *CI; if (isa(Op0C) && SrcTy->isFloatingPointTy() && match(Op1, m_ConstantInt(CI)) && CI->isMaxValue(true)) { Module *M = I.getModule(); Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, SrcTy); Value *Call = Builder->CreateCall(Fabs, Op0COp, "fabs"); return CastInst::CreateBitOrPointerCast(Call, I.getType()); } } { Value *X = nullptr; bool OpsSwapped = false; // Canonicalize SExt or Not to the LHS if (match(Op1, m_SExt(m_Value())) || match(Op1, m_Not(m_Value()))) { std::swap(Op0, Op1); OpsSwapped = true; } // Fold (and (sext bool to A), B) --> (select bool, B, 0) if (match(Op0, m_SExt(m_Value(X))) && X->getType()->getScalarType()->isIntegerTy(1)) { Value *Zero = Constant::getNullValue(Op1->getType()); return SelectInst::Create(X, Op1, Zero); } // Fold (and ~(sext bool to A), B) --> (select bool, 0, B) if (match(Op0, m_Not(m_SExt(m_Value(X)))) && X->getType()->getScalarType()->isIntegerTy(1)) { Value *Zero = Constant::getNullValue(Op0->getType()); return SelectInst::Create(X, Zero, Op1); } if (OpsSwapped) std::swap(Op0, Op1); } return Changed ? &I : nullptr; } - -/// Analyze the specified subexpression and see if it is capable of providing -/// pieces of a bswap or bitreverse. The subexpression provides a potential -/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in -/// the output of the expression came from a corresponding bit in some other -/// value. This function is recursive, and the end result is a mapping of -/// (value, bitnumber) to bitnumber. It is the caller's responsibility to -/// validate that all `value`s are identical and that the bitnumber to bitnumber -/// mapping is correct for a bswap or bitreverse. -/// -/// For example, if the current subexpression if "(shl i32 %X, 24)" then we know -/// that the expression deposits the low byte of %X into the high byte of the -/// result and that all other bits are zero. This expression is accepted, -/// BitValues[24-31] are set to %X and BitProvenance[24-31] are set to [0-7]. -/// -/// This function returns true if the match was unsuccessful and false if so. -/// On entry to the function the "OverallLeftShift" is a signed integer value -/// indicating the number of bits that the subexpression is later shifted. For -/// example, if the expression is later right shifted by 16 bits, the -/// OverallLeftShift value would be -16 on entry. This is used to specify which -/// bits of BitValues are actually being set. -/// -/// Similarly, BitMask is a bitmask where a bit is clear if its corresponding -/// bit is masked to zero by a user. For example, in (X & 255), X will be -/// processed with a bytemask of 255. BitMask is always in the local -/// (OverallLeftShift) coordinate space. -/// -static bool CollectBitParts(Value *V, int OverallLeftShift, APInt BitMask, - SmallVectorImpl &BitValues, - SmallVectorImpl &BitProvenance) { - if (Instruction *I = dyn_cast(V)) { - // If this is an or instruction, it may be an inner node of the bswap. - if (I->getOpcode() == Instruction::Or) - return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, - BitValues, BitProvenance) || - CollectBitParts(I->getOperand(1), OverallLeftShift, BitMask, - BitValues, BitProvenance); - - // If this is a logical shift by a constant, recurse with OverallLeftShift - // and BitMask adjusted. - if (I->isLogicalShift() && isa(I->getOperand(1))) { - unsigned ShAmt = - cast(I->getOperand(1))->getLimitedValue(~0U); - // Ensure the shift amount is defined. - if (ShAmt > BitValues.size()) - return true; - - unsigned BitShift = ShAmt; - if (I->getOpcode() == Instruction::Shl) { - // X << C -> collect(X, +C) - OverallLeftShift += BitShift; - BitMask = BitMask.lshr(BitShift); - } else { - // X >>u C -> collect(X, -C) - OverallLeftShift -= BitShift; - BitMask = BitMask.shl(BitShift); - } - - if (OverallLeftShift >= (int)BitValues.size()) - return true; - if (OverallLeftShift <= -(int)BitValues.size()) - return true; - - return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, - BitValues, BitProvenance); - } - - // If this is a logical 'and' with a mask that clears bits, clear the - // corresponding bits in BitMask. - if (I->getOpcode() == Instruction::And && - isa(I->getOperand(1))) { - unsigned NumBits = BitValues.size(); - APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1); - const APInt &AndMask = cast(I->getOperand(1))->getValue(); - - for (unsigned i = 0; i != NumBits; ++i, Bit <<= 1) { - // If this bit is masked out by a later operation, we don't care what - // the and mask is. - if (BitMask[i] == 0) - continue; - - // If the AndMask is zero for this bit, clear the bit. - APInt MaskB = AndMask & Bit; - if (MaskB == 0) { - BitMask.clearBit(i); - continue; - } - - // Otherwise, this bit is kept. - } - - return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, - BitValues, BitProvenance); - } - } - - // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be - // the input value to the bswap/bitreverse. To be part of a bswap or - // bitreverse we must be demanding a contiguous range of bits from it. - unsigned InputBitLen = BitMask.countPopulation(); - unsigned InputBitNo = BitMask.countTrailingZeros(); - if (BitMask.getBitWidth() - BitMask.countLeadingZeros() - InputBitNo != - InputBitLen) - // Not a contiguous set range of bits! - return true; - - // We know we're moving a contiguous range of bits from the input to the - // output. Record which bits in the output came from which bits in the input. - unsigned DestBitNo = InputBitNo + OverallLeftShift; - for (unsigned I = 0; I < InputBitLen; ++I) - BitProvenance[DestBitNo + I] = InputBitNo + I; - - // If the destination bit value is already defined, the values are or'd - // together, which isn't a bswap/bitreverse (unless it's an or of the same - // bits). - if (BitValues[DestBitNo] && BitValues[DestBitNo] != V) - return true; - for (unsigned I = 0; I < InputBitLen; ++I) - BitValues[DestBitNo + I] = V; - - return false; -} - -static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To, - unsigned BitWidth) { - if (From % 8 != To % 8) - return false; - // Convert from bit indices to byte indices and check for a byte reversal. - From >>= 3; - To >>= 3; - BitWidth >>= 3; - return From == BitWidth - To - 1; -} - -static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To, - unsigned BitWidth) { - return From == BitWidth - To - 1; -} - /// Given an OR instruction, check to see if this is a bswap or bitreverse /// idiom. If so, insert the new intrinsic and return it. Instruction *InstCombiner::MatchBSwapOrBitReverse(BinaryOperator &I) { - IntegerType *ITy = dyn_cast(I.getType()); - if (!ITy) - return nullptr; // Can't do vectors. - unsigned BW = ITy->getBitWidth(); - - /// We keep track of which bit (BitProvenance) inside which value (BitValues) - /// defines each bit in the result. - SmallVector BitValues(BW, nullptr); - SmallVector BitProvenance(BW, -1); - - // Try to find all the pieces corresponding to the bswap. - APInt BitMask = APInt::getAllOnesValue(BitValues.size()); - if (CollectBitParts(&I, 0, BitMask, BitValues, BitProvenance)) + SmallVector Insts; + if (!recognizeBitReverseOrBSwapIdiom(&I, true, false, Insts)) return nullptr; + Instruction *LastInst = Insts.pop_back_val(); + LastInst->removeFromParent(); - // Check to see if all of the bits come from the same value. - Value *V = BitValues[0]; - if (!V) return nullptr; // Didn't find a bit? Must be zero. - - if (!std::all_of(BitValues.begin(), BitValues.end(), - [&](const Value *X) { return X == V; })) - return nullptr; - - // Now, is the bit permutation correct for a bswap or a bitreverse? We can - // only byteswap values with an even number of bytes. - bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true;; - for (unsigned i = 0, e = BitValues.size(); i != e; ++i) { - OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW); - OKForBitReverse &= - bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW); - } - - Intrinsic::ID Intrin; - if (OKForBSwap) - Intrin = Intrinsic::bswap; - else if (OKForBitReverse) - Intrin = Intrinsic::bitreverse; - else - return nullptr; - - Function *F = Intrinsic::getDeclaration(I.getModule(), Intrin, ITy); - return CallInst::Create(F, V); + for (auto *Inst : Insts) + Worklist.Add(Inst); + return LastInst; } /// We have an expression of the form (A&C)|(B&D). Check if A is (cond?-1:0) /// and either B or D is ~(cond?-1,0) or (cond?0,-1), then we can simplify this /// expression to "cond ? C : D or B". static Instruction *MatchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D) { // If A is not a select of -1/0, this cannot match. Value *Cond = nullptr; if (!match(A, m_SExt(m_Value(Cond))) || !Cond->getType()->isIntegerTy(1)) return nullptr; // ((cond?-1:0)&C) | (B&(cond?0:-1)) -> cond ? C : B. if (match(D, m_Not(m_SExt(m_Specific(Cond))))) return SelectInst::Create(Cond, C, B); if (match(D, m_SExt(m_Not(m_Specific(Cond))))) return SelectInst::Create(Cond, C, B); // ((cond?-1:0)&C) | ((cond?0:-1)&D) -> cond ? C : D. if (match(B, m_Not(m_SExt(m_Specific(Cond))))) return SelectInst::Create(Cond, C, D); if (match(B, m_SExt(m_Not(m_Specific(Cond))))) return SelectInst::Create(Cond, C, D); return nullptr; } /// Fold (icmp)|(icmp) if possible. Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI) { ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) // if K1 and K2 are a one-bit mask. ConstantInt *LHSCst = dyn_cast(LHS->getOperand(1)); ConstantInt *RHSCst = dyn_cast(RHS->getOperand(1)); if (LHS->getPredicate() == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero() && RHS->getPredicate() == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) { BinaryOperator *LAnd = dyn_cast(LHS->getOperand(0)); BinaryOperator *RAnd = dyn_cast(RHS->getOperand(0)); if (LAnd && RAnd && LAnd->hasOneUse() && RHS->hasOneUse() && LAnd->getOpcode() == Instruction::And && RAnd->getOpcode() == Instruction::And) { Value *Mask = nullptr; Value *Masked = nullptr; if (LAnd->getOperand(0) == RAnd->getOperand(0) && isKnownToBeAPowerOfTwo(LAnd->getOperand(1), DL, false, 0, AC, CxtI, DT) && isKnownToBeAPowerOfTwo(RAnd->getOperand(1), DL, false, 0, AC, CxtI, DT)) { Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1)); Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask); } else if (LAnd->getOperand(1) == RAnd->getOperand(1) && isKnownToBeAPowerOfTwo(LAnd->getOperand(0), DL, false, 0, AC, CxtI, DT) && isKnownToBeAPowerOfTwo(RAnd->getOperand(0), DL, false, 0, AC, CxtI, DT)) { Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0)); Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask); } if (Masked) return Builder->CreateICmp(ICmpInst::ICMP_NE, Masked, Mask); } } // Fold (icmp ult/ule (A + C1), C3) | (icmp ult/ule (A + C2), C3) // --> (icmp ult/ule ((A & ~(C1 ^ C2)) + max(C1, C2)), C3) // The original condition actually refers to the following two ranges: // [MAX_UINT-C1+1, MAX_UINT-C1+1+C3] and [MAX_UINT-C2+1, MAX_UINT-C2+1+C3] // We can fold these two ranges if: // 1) C1 and C2 is unsigned greater than C3. // 2) The two ranges are separated. // 3) C1 ^ C2 is one-bit mask. // 4) LowRange1 ^ LowRange2 and HighRange1 ^ HighRange2 are one-bit mask. // This implies all values in the two ranges differ by exactly one bit. if ((LHSCC == ICmpInst::ICMP_ULT || LHSCC == ICmpInst::ICMP_ULE) && LHSCC == RHSCC && LHSCst && RHSCst && LHS->hasOneUse() && RHS->hasOneUse() && LHSCst->getType() == RHSCst->getType() && LHSCst->getValue() == (RHSCst->getValue())) { Value *LAdd = LHS->getOperand(0); Value *RAdd = RHS->getOperand(0); Value *LAddOpnd, *RAddOpnd; ConstantInt *LAddCst, *RAddCst; if (match(LAdd, m_Add(m_Value(LAddOpnd), m_ConstantInt(LAddCst))) && match(RAdd, m_Add(m_Value(RAddOpnd), m_ConstantInt(RAddCst))) && LAddCst->getValue().ugt(LHSCst->getValue()) && RAddCst->getValue().ugt(LHSCst->getValue())) { APInt DiffCst = LAddCst->getValue() ^ RAddCst->getValue(); if (LAddOpnd == RAddOpnd && DiffCst.isPowerOf2()) { ConstantInt *MaxAddCst = nullptr; if (LAddCst->getValue().ult(RAddCst->getValue())) MaxAddCst = RAddCst; else MaxAddCst = LAddCst; APInt RRangeLow = -RAddCst->getValue(); APInt RRangeHigh = RRangeLow + LHSCst->getValue(); APInt LRangeLow = -LAddCst->getValue(); APInt LRangeHigh = LRangeLow + LHSCst->getValue(); APInt LowRangeDiff = RRangeLow ^ LRangeLow; APInt HighRangeDiff = RRangeHigh ^ LRangeHigh; APInt RangeDiff = LRangeLow.sgt(RRangeLow) ? LRangeLow - RRangeLow : RRangeLow - LRangeLow; if (LowRangeDiff.isPowerOf2() && LowRangeDiff == HighRangeDiff && RangeDiff.ugt(LHSCst->getValue())) { Value *MaskCst = ConstantInt::get(LAddCst->getType(), ~DiffCst); Value *NewAnd = Builder->CreateAnd(LAddOpnd, MaskCst); Value *NewAdd = Builder->CreateAdd(NewAnd, MaxAddCst); return (Builder->CreateICmp(LHS->getPredicate(), NewAdd, LHSCst)); } } } } // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B) if (PredicatesFoldable(LHSCC, RHSCC)) { if (LHS->getOperand(0) == RHS->getOperand(1) && LHS->getOperand(1) == RHS->getOperand(0)) LHS->swapOperands(); if (LHS->getOperand(0) == RHS->getOperand(0) && LHS->getOperand(1) == RHS->getOperand(1)) { Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); unsigned Code = getICmpCode(LHS) | getICmpCode(RHS); bool isSigned = LHS->isSigned() || RHS->isSigned(); return getNewICmpValue(isSigned, Code, Op0, Op1, Builder); } } // handle (roughly): // (icmp ne (A & B), C) | (icmp ne (A & D), E) if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder)) return V; Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0); if (LHS->hasOneUse() || RHS->hasOneUse()) { // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1) // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1) Value *A = nullptr, *B = nullptr; if (LHSCC == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero()) { B = Val; if (RHSCC == ICmpInst::ICMP_ULT && Val == RHS->getOperand(1)) A = Val2; else if (RHSCC == ICmpInst::ICMP_UGT && Val == Val2) A = RHS->getOperand(1); } // (icmp ult A, B) | (icmp eq B, 0) -> (icmp ule A, B-1) // (icmp ugt B, A) | (icmp eq B, 0) -> (icmp ule A, B-1) else if (RHSCC == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) { B = Val2; if (LHSCC == ICmpInst::ICMP_ULT && Val2 == LHS->getOperand(1)) A = Val; else if (LHSCC == ICmpInst::ICMP_UGT && Val2 == Val) A = LHS->getOperand(1); } if (A && B) return Builder->CreateICmp( ICmpInst::ICMP_UGE, Builder->CreateAdd(B, ConstantInt::getSigned(B->getType(), -1)), A); } // E.g. (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/true)) return V; // E.g. (icmp sgt x, n) | (icmp slt x, 0) --> icmp ugt x, n if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/true)) return V; // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2). if (!LHSCst || !RHSCst) return nullptr; if (LHSCst == RHSCst && LHSCC == RHSCC) { // (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0) if (LHSCC == ICmpInst::ICMP_NE && LHSCst->isZero()) { Value *NewOr = Builder->CreateOr(Val, Val2); return Builder->CreateICmp(LHSCC, NewOr, LHSCst); } } // (icmp ult (X + CA), C1) | (icmp eq X, C2) -> (icmp ule (X + CA), C1) // iff C2 + CA == C1. if (LHSCC == ICmpInst::ICMP_ULT && RHSCC == ICmpInst::ICMP_EQ) { ConstantInt *AddCst; if (match(Val, m_Add(m_Specific(Val2), m_ConstantInt(AddCst)))) if (RHSCst->getValue() + AddCst->getValue() == LHSCst->getValue()) return Builder->CreateICmpULE(Val, LHSCst); } // From here on, we only handle: // (icmp1 A, C1) | (icmp2 A, C2) --> something simpler. if (Val != Val2) return nullptr; // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere. if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE || RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE || LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE || RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE) return nullptr; // We can't fold (ugt x, C) | (sgt x, C2). if (!PredicatesFoldable(LHSCC, RHSCC)) return nullptr; // Ensure that the larger constant is on the RHS. bool ShouldSwap; if (CmpInst::isSigned(LHSCC) || (ICmpInst::isEquality(LHSCC) && CmpInst::isSigned(RHSCC))) ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue()); else ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue()); if (ShouldSwap) { std::swap(LHS, RHS); std::swap(LHSCst, RHSCst); std::swap(LHSCC, RHSCC); } // At this point, we know we have two icmp instructions // comparing a value against two constants and or'ing the result // together. Because of the above check, we know that we only have // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the // icmp folding check above), that the two constants are not // equal. assert(LHSCst != RHSCst && "Compares not folded above?"); switch (LHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: switch (RHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: if (LHS->getOperand(0) == RHS->getOperand(0)) { // if LHSCst and RHSCst differ only by one bit: // (A == C1 || A == C2) -> (A | (C1 ^ C2)) == C2 assert(LHSCst->getValue().ule(LHSCst->getValue())); APInt Xor = LHSCst->getValue() ^ RHSCst->getValue(); if (Xor.isPowerOf2()) { Value *Cst = Builder->getInt(Xor); Value *Or = Builder->CreateOr(LHS->getOperand(0), Cst); return Builder->CreateICmp(ICmpInst::ICMP_EQ, Or, RHSCst); } } if (LHSCst == SubOne(RHSCst)) { // (X == 13 | X == 14) -> X-13 CreateAdd(Val, AddCST, Val->getName()+".off"); AddCST = ConstantExpr::getSub(AddOne(RHSCst), LHSCst); return Builder->CreateICmpULT(Add, AddCST); } break; // (X == 13 | X == 15) -> no change case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change break; case ICmpInst::ICMP_NE: // (X == 13 | X != 15) -> X != 15 case ICmpInst::ICMP_ULT: // (X == 13 | X u< 15) -> X u< 15 case ICmpInst::ICMP_SLT: // (X == 13 | X s< 15) -> X s< 15 return RHS; } break; case ICmpInst::ICMP_NE: switch (RHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: // (X != 13 | X == 15) -> X != 13 case ICmpInst::ICMP_UGT: // (X != 13 | X u> 15) -> X != 13 case ICmpInst::ICMP_SGT: // (X != 13 | X s> 15) -> X != 13 return LHS; case ICmpInst::ICMP_NE: // (X != 13 | X != 15) -> true case ICmpInst::ICMP_ULT: // (X != 13 | X u< 15) -> true case ICmpInst::ICMP_SLT: // (X != 13 | X s< 15) -> true return Builder->getTrue(); } case ICmpInst::ICMP_ULT: switch (RHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change break; case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2 // If RHSCst is [us]MAXINT, it is always false. Not handling // this can cause overflow. if (RHSCst->isMaxValue(false)) return LHS; return InsertRangeTest(Val, LHSCst, AddOne(RHSCst), false, false); case ICmpInst::ICMP_SGT: // (X u< 13 | X s> 15) -> no change break; case ICmpInst::ICMP_NE: // (X u< 13 | X != 15) -> X != 15 case ICmpInst::ICMP_ULT: // (X u< 13 | X u< 15) -> X u< 15 return RHS; case ICmpInst::ICMP_SLT: // (X u< 13 | X s< 15) -> no change break; } break; case ICmpInst::ICMP_SLT: switch (RHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: // (X s< 13 | X == 14) -> no change break; case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) s> 2 // If RHSCst is [us]MAXINT, it is always false. Not handling // this can cause overflow. if (RHSCst->isMaxValue(true)) return LHS; return InsertRangeTest(Val, LHSCst, AddOne(RHSCst), true, false); case ICmpInst::ICMP_UGT: // (X s< 13 | X u> 15) -> no change break; case ICmpInst::ICMP_NE: // (X s< 13 | X != 15) -> X != 15 case ICmpInst::ICMP_SLT: // (X s< 13 | X s< 15) -> X s< 15 return RHS; case ICmpInst::ICMP_ULT: // (X s< 13 | X u< 15) -> no change break; } break; case ICmpInst::ICMP_UGT: switch (RHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: // (X u> 13 | X == 15) -> X u> 13 case ICmpInst::ICMP_UGT: // (X u> 13 | X u> 15) -> X u> 13 return LHS; case ICmpInst::ICMP_SGT: // (X u> 13 | X s> 15) -> no change break; case ICmpInst::ICMP_NE: // (X u> 13 | X != 15) -> true case ICmpInst::ICMP_ULT: // (X u> 13 | X u< 15) -> true return Builder->getTrue(); case ICmpInst::ICMP_SLT: // (X u> 13 | X s< 15) -> no change break; } break; case ICmpInst::ICMP_SGT: switch (RHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: // (X s> 13 | X == 15) -> X > 13 case ICmpInst::ICMP_SGT: // (X s> 13 | X s> 15) -> X > 13 return LHS; case ICmpInst::ICMP_UGT: // (X s> 13 | X u> 15) -> no change break; case ICmpInst::ICMP_NE: // (X s> 13 | X != 15) -> true case ICmpInst::ICMP_SLT: // (X s> 13 | X s< 15) -> true return Builder->getTrue(); case ICmpInst::ICMP_ULT: // (X s> 13 | X u< 15) -> no change break; } break; } return nullptr; } /// Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of instcombine, this returns /// a Value which should already be inserted into the function. Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (LHS->getPredicate() == FCmpInst::FCMP_UNO && RHS->getPredicate() == FCmpInst::FCMP_UNO && LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType()) { if (ConstantFP *LHSC = dyn_cast(LHS->getOperand(1))) if (ConstantFP *RHSC = dyn_cast(RHS->getOperand(1))) { // If either of the constants are nans, then the whole thing returns // true. if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN()) return Builder->getTrue(); // Otherwise, no need to compare the two constants, compare the // rest. return Builder->CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0)); } // Handle vector zeros. This occurs because the canonical form of // "fcmp uno x,x" is "fcmp uno x, 0". if (isa(LHS->getOperand(1)) && isa(RHS->getOperand(1))) return Builder->CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0)); return nullptr; } Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1); Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1); FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate(); if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) { // Swap RHS operands to match LHS. Op1CC = FCmpInst::getSwappedPredicate(Op1CC); std::swap(Op1LHS, Op1RHS); } if (Op0LHS == Op1LHS && Op0RHS == Op1RHS) { // Simplify (fcmp cc0 x, y) | (fcmp cc1 x, y). if (Op0CC == Op1CC) return Builder->CreateFCmp((FCmpInst::Predicate)Op0CC, Op0LHS, Op0RHS); if (Op0CC == FCmpInst::FCMP_TRUE || Op1CC == FCmpInst::FCMP_TRUE) return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1); if (Op0CC == FCmpInst::FCMP_FALSE) return RHS; if (Op1CC == FCmpInst::FCMP_FALSE) return LHS; bool Op0Ordered; bool Op1Ordered; unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered); unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered); if (Op0Ordered == Op1Ordered) { // If both are ordered or unordered, return a new fcmp with // or'ed predicates. return getFCmpValue(Op0Ordered, Op0Pred|Op1Pred, Op0LHS, Op0RHS, Builder); } } return nullptr; } /// This helper function folds: /// /// ((A | B) & C1) | (B & C2) /// /// into: /// /// (A & C1) | B /// /// when the XOR of the two constants is "all ones" (-1). Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op, Value *A, Value *B, Value *C) { ConstantInt *CI1 = dyn_cast(C); if (!CI1) return nullptr; Value *V1 = nullptr; ConstantInt *CI2 = nullptr; if (!match(Op, m_And(m_Value(V1), m_ConstantInt(CI2)))) return nullptr; APInt Xor = CI1->getValue() ^ CI2->getValue(); if (!Xor.isAllOnesValue()) return nullptr; if (V1 == A || V1 == B) { Value *NewOp = Builder->CreateAnd((V1 == A) ? B : A, CI1); return BinaryOperator::CreateOr(NewOp, V1); } return nullptr; } /// \brief This helper function folds: /// /// ((A | B) & C1) ^ (B & C2) /// /// into: /// /// (A & C1) ^ B /// /// when the XOR of the two constants is "all ones" (-1). Instruction *InstCombiner::FoldXorWithConstants(BinaryOperator &I, Value *Op, Value *A, Value *B, Value *C) { ConstantInt *CI1 = dyn_cast(C); if (!CI1) return nullptr; Value *V1 = nullptr; ConstantInt *CI2 = nullptr; if (!match(Op, m_And(m_Value(V1), m_ConstantInt(CI2)))) return nullptr; APInt Xor = CI1->getValue() ^ CI2->getValue(); if (!Xor.isAllOnesValue()) return nullptr; if (V1 == A || V1 == B) { Value *NewOp = Builder->CreateAnd(V1 == A ? B : A, CI1); return BinaryOperator::CreateXor(NewOp, V1); } return nullptr; } Instruction *InstCombiner::visitOr(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); if (Value *V = SimplifyOrInst(Op0, Op1, DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); // (A&B)|(A&C) -> A&(B|C) etc if (Value *V = SimplifyUsingDistributiveLaws(I)) return ReplaceInstUsesWith(I, V); // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. if (SimplifyDemandedInstructionBits(I)) return &I; if (Value *V = SimplifyBSwap(I)) return ReplaceInstUsesWith(I, V); if (ConstantInt *RHS = dyn_cast(Op1)) { ConstantInt *C1 = nullptr; Value *X = nullptr; // (X & C1) | C2 --> (X | C2) & (C1|C2) // iff (C1 & C2) == 0. if (match(Op0, m_And(m_Value(X), m_ConstantInt(C1))) && (RHS->getValue() & C1->getValue()) != 0 && Op0->hasOneUse()) { Value *Or = Builder->CreateOr(X, RHS); Or->takeName(Op0); return BinaryOperator::CreateAnd(Or, Builder->getInt(RHS->getValue() | C1->getValue())); } // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2) if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) && Op0->hasOneUse()) { Value *Or = Builder->CreateOr(X, RHS); Or->takeName(Op0); return BinaryOperator::CreateXor(Or, Builder->getInt(C1->getValue() & ~RHS->getValue())); } // Try to fold constant and into select arguments. if (SelectInst *SI = dyn_cast(Op0)) if (Instruction *R = FoldOpIntoSelect(I, SI)) return R; if (isa(Op0)) if (Instruction *NV = FoldOpIntoPhi(I)) return NV; } Value *A = nullptr, *B = nullptr; ConstantInt *C1 = nullptr, *C2 = nullptr; // (A | B) | C and A | (B | C) -> bswap if possible. bool OrOfOrs = match(Op0, m_Or(m_Value(), m_Value())) || match(Op1, m_Or(m_Value(), m_Value())); // (A >> B) | (C << D) and (A << B) | (B >> C) -> bswap if possible. bool OrOfShifts = match(Op0, m_LogicalShift(m_Value(), m_Value())) && match(Op1, m_LogicalShift(m_Value(), m_Value())); // (A & B) | (C & D) -> bswap if possible. bool OrOfAnds = match(Op0, m_And(m_Value(), m_Value())) && match(Op1, m_And(m_Value(), m_Value())); if (OrOfOrs || OrOfShifts || OrOfAnds) if (Instruction *BSwap = MatchBSwapOrBitReverse(I)) return BSwap; // (X^C)|Y -> (X|Y)^C iff Y&C == 0 if (Op0->hasOneUse() && match(Op0, m_Xor(m_Value(A), m_ConstantInt(C1))) && MaskedValueIsZero(Op1, C1->getValue(), 0, &I)) { Value *NOr = Builder->CreateOr(A, Op1); NOr->takeName(Op0); return BinaryOperator::CreateXor(NOr, C1); } // Y|(X^C) -> (X|Y)^C iff Y&C == 0 if (Op1->hasOneUse() && match(Op1, m_Xor(m_Value(A), m_ConstantInt(C1))) && MaskedValueIsZero(Op0, C1->getValue(), 0, &I)) { Value *NOr = Builder->CreateOr(A, Op0); NOr->takeName(Op0); return BinaryOperator::CreateXor(NOr, C1); } // ((~A & B) | A) -> (A | B) if (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) && match(Op1, m_Specific(A))) return BinaryOperator::CreateOr(A, B); // ((A & B) | ~A) -> (~A | B) if (match(Op0, m_And(m_Value(A), m_Value(B))) && match(Op1, m_Not(m_Specific(A)))) return BinaryOperator::CreateOr(Builder->CreateNot(A), B); // (A & (~B)) | (A ^ B) -> (A ^ B) if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) && match(Op1, m_Xor(m_Specific(A), m_Specific(B)))) return BinaryOperator::CreateXor(A, B); // (A ^ B) | ( A & (~B)) -> (A ^ B) if (match(Op0, m_Xor(m_Value(A), m_Value(B))) && match(Op1, m_And(m_Specific(A), m_Not(m_Specific(B))))) return BinaryOperator::CreateXor(A, B); // (A & C)|(B & D) Value *C = nullptr, *D = nullptr; if (match(Op0, m_And(m_Value(A), m_Value(C))) && match(Op1, m_And(m_Value(B), m_Value(D)))) { Value *V1 = nullptr, *V2 = nullptr; C1 = dyn_cast(C); C2 = dyn_cast(D); if (C1 && C2) { // (A & C1)|(B & C2) if ((C1->getValue() & C2->getValue()) == 0) { // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2) // iff (C1&C2) == 0 and (N&~C1) == 0 if (match(A, m_Or(m_Value(V1), m_Value(V2))) && ((V1 == B && MaskedValueIsZero(V2, ~C1->getValue(), 0, &I)) || // (V|N) (V2 == B && MaskedValueIsZero(V1, ~C1->getValue(), 0, &I)))) // (N|V) return BinaryOperator::CreateAnd(A, Builder->getInt(C1->getValue()|C2->getValue())); // Or commutes, try both ways. if (match(B, m_Or(m_Value(V1), m_Value(V2))) && ((V1 == A && MaskedValueIsZero(V2, ~C2->getValue(), 0, &I)) || // (V|N) (V2 == A && MaskedValueIsZero(V1, ~C2->getValue(), 0, &I)))) // (N|V) return BinaryOperator::CreateAnd(B, Builder->getInt(C1->getValue()|C2->getValue())); // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2) // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0. ConstantInt *C3 = nullptr, *C4 = nullptr; if (match(A, m_Or(m_Value(V1), m_ConstantInt(C3))) && (C3->getValue() & ~C1->getValue()) == 0 && match(B, m_Or(m_Specific(V1), m_ConstantInt(C4))) && (C4->getValue() & ~C2->getValue()) == 0) { V2 = Builder->CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield"); return BinaryOperator::CreateAnd(V2, Builder->getInt(C1->getValue()|C2->getValue())); } } } // (A & (C0?-1:0)) | (B & ~(C0?-1:0)) -> C0 ? A : B, and commuted variants. // Don't do this for vector select idioms, the code generator doesn't handle // them well yet. if (!I.getType()->isVectorTy()) { if (Instruction *Match = MatchSelectFromAndOr(A, B, C, D)) return Match; if (Instruction *Match = MatchSelectFromAndOr(B, A, D, C)) return Match; if (Instruction *Match = MatchSelectFromAndOr(C, B, A, D)) return Match; if (Instruction *Match = MatchSelectFromAndOr(D, A, B, C)) return Match; } // ((A&~B)|(~A&B)) -> A^B if ((match(C, m_Not(m_Specific(D))) && match(B, m_Not(m_Specific(A))))) return BinaryOperator::CreateXor(A, D); // ((~B&A)|(~A&B)) -> A^B if ((match(A, m_Not(m_Specific(D))) && match(B, m_Not(m_Specific(C))))) return BinaryOperator::CreateXor(C, D); // ((A&~B)|(B&~A)) -> A^B if ((match(C, m_Not(m_Specific(B))) && match(D, m_Not(m_Specific(A))))) return BinaryOperator::CreateXor(A, B); // ((~B&A)|(B&~A)) -> A^B if ((match(A, m_Not(m_Specific(B))) && match(D, m_Not(m_Specific(C))))) return BinaryOperator::CreateXor(C, B); // ((A|B)&1)|(B&-2) -> (A&1) | B if (match(A, m_Or(m_Value(V1), m_Specific(B))) || match(A, m_Or(m_Specific(B), m_Value(V1)))) { Instruction *Ret = FoldOrWithConstants(I, Op1, V1, B, C); if (Ret) return Ret; } // (B&-2)|((A|B)&1) -> (A&1) | B if (match(B, m_Or(m_Specific(A), m_Value(V1))) || match(B, m_Or(m_Value(V1), m_Specific(A)))) { Instruction *Ret = FoldOrWithConstants(I, Op0, A, V1, D); if (Ret) return Ret; } // ((A^B)&1)|(B&-2) -> (A&1) ^ B if (match(A, m_Xor(m_Value(V1), m_Specific(B))) || match(A, m_Xor(m_Specific(B), m_Value(V1)))) { Instruction *Ret = FoldXorWithConstants(I, Op1, V1, B, C); if (Ret) return Ret; } // (B&-2)|((A^B)&1) -> (A&1) ^ B if (match(B, m_Xor(m_Specific(A), m_Value(V1))) || match(B, m_Xor(m_Value(V1), m_Specific(A)))) { Instruction *Ret = FoldXorWithConstants(I, Op0, A, V1, D); if (Ret) return Ret; } } // (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) if (Op1->hasOneUse() || cast(Op1)->hasOneUse()) return BinaryOperator::CreateOr(Op0, C); // ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B)))) if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) if (Op0->hasOneUse() || cast(Op0)->hasOneUse()) return BinaryOperator::CreateOr(Op1, C); // ((B | C) & A) | B -> B | (A & C) if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A)))) return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C)); if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) return DeMorgan; // Canonicalize xor to the RHS. bool SwappedForXor = false; if (match(Op0, m_Xor(m_Value(), m_Value()))) { std::swap(Op0, Op1); SwappedForXor = true; } // A | ( A ^ B) -> A | B // A | (~A ^ B) -> A | ~B // (A & B) | (A ^ B) if (match(Op1, m_Xor(m_Value(A), m_Value(B)))) { if (Op0 == A || Op0 == B) return BinaryOperator::CreateOr(A, B); if (match(Op0, m_And(m_Specific(A), m_Specific(B))) || match(Op0, m_And(m_Specific(B), m_Specific(A)))) return BinaryOperator::CreateOr(A, B); if (Op1->hasOneUse() && match(A, m_Not(m_Specific(Op0)))) { Value *Not = Builder->CreateNot(B, B->getName()+".not"); return BinaryOperator::CreateOr(Not, Op0); } if (Op1->hasOneUse() && match(B, m_Not(m_Specific(Op0)))) { Value *Not = Builder->CreateNot(A, A->getName()+".not"); return BinaryOperator::CreateOr(Not, Op0); } } // A | ~(A | B) -> A | ~B // A | ~(A ^ B) -> A | ~B if (match(Op1, m_Not(m_Value(A)))) if (BinaryOperator *B = dyn_cast(A)) if ((Op0 == B->getOperand(0) || Op0 == B->getOperand(1)) && Op1->hasOneUse() && (B->getOpcode() == Instruction::Or || B->getOpcode() == Instruction::Xor)) { Value *NotOp = Op0 == B->getOperand(0) ? B->getOperand(1) : B->getOperand(0); Value *Not = Builder->CreateNot(NotOp, NotOp->getName()+".not"); return BinaryOperator::CreateOr(Not, Op0); } // (A & B) | ((~A) ^ B) -> (~A ^ B) if (match(Op0, m_And(m_Value(A), m_Value(B))) && match(Op1, m_Xor(m_Not(m_Specific(A)), m_Specific(B)))) return BinaryOperator::CreateXor(Builder->CreateNot(A), B); // ((~A) ^ B) | (A & B) -> (~A ^ B) if (match(Op0, m_Xor(m_Not(m_Value(A)), m_Value(B))) && match(Op1, m_And(m_Specific(A), m_Specific(B)))) return BinaryOperator::CreateXor(Builder->CreateNot(A), B); if (SwappedForXor) std::swap(Op0, Op1); { ICmpInst *LHS = dyn_cast(Op0); ICmpInst *RHS = dyn_cast(Op1); if (LHS && RHS) if (Value *Res = FoldOrOfICmps(LHS, RHS, &I)) return ReplaceInstUsesWith(I, Res); // TODO: Make this recursive; it's a little tricky because an arbitrary // number of 'or' instructions might have to be created. Value *X, *Y; if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) { if (auto *Cmp = dyn_cast(X)) if (Value *Res = FoldOrOfICmps(LHS, Cmp, &I)) return ReplaceInstUsesWith(I, Builder->CreateOr(Res, Y)); if (auto *Cmp = dyn_cast(Y)) if (Value *Res = FoldOrOfICmps(LHS, Cmp, &I)) return ReplaceInstUsesWith(I, Builder->CreateOr(Res, X)); } if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) { if (auto *Cmp = dyn_cast(X)) if (Value *Res = FoldOrOfICmps(Cmp, RHS, &I)) return ReplaceInstUsesWith(I, Builder->CreateOr(Res, Y)); if (auto *Cmp = dyn_cast(Y)) if (Value *Res = FoldOrOfICmps(Cmp, RHS, &I)) return ReplaceInstUsesWith(I, Builder->CreateOr(Res, X)); } } // (fcmp uno x, c) | (fcmp uno y, c) -> (fcmp uno x, y) if (FCmpInst *LHS = dyn_cast(I.getOperand(0))) if (FCmpInst *RHS = dyn_cast(I.getOperand(1))) if (Value *Res = FoldOrOfFCmps(LHS, RHS)) return ReplaceInstUsesWith(I, Res); // fold (or (cast A), (cast B)) -> (cast (or A, B)) if (CastInst *Op0C = dyn_cast(Op0)) { CastInst *Op1C = dyn_cast(Op1); if (Op1C && Op0C->getOpcode() == Op1C->getOpcode()) {// same cast kind ? Type *SrcTy = Op0C->getOperand(0)->getType(); if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntOrIntVectorTy()) { Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0); if ((!isa(Op0COp) || !isa(Op1COp)) && // Only do this if the casts both really cause code to be // generated. ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) && ShouldOptimizeCast(Op1C->getOpcode(), Op1COp, I.getType())) { Value *NewOp = Builder->CreateOr(Op0COp, Op1COp, I.getName()); return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType()); } // If this is or(cast(icmp), cast(icmp)), try to fold this even if the // cast is otherwise not optimizable. This happens for vector sexts. if (ICmpInst *RHS = dyn_cast(Op1COp)) if (ICmpInst *LHS = dyn_cast(Op0COp)) if (Value *Res = FoldOrOfICmps(LHS, RHS, &I)) return CastInst::Create(Op0C->getOpcode(), Res, I.getType()); // If this is or(cast(fcmp), cast(fcmp)), try to fold this even if the // cast is otherwise not optimizable. This happens for vector sexts. if (FCmpInst *RHS = dyn_cast(Op1COp)) if (FCmpInst *LHS = dyn_cast(Op0COp)) if (Value *Res = FoldOrOfFCmps(LHS, RHS)) return CastInst::Create(Op0C->getOpcode(), Res, I.getType()); } } } // or(sext(A), B) -> A ? -1 : B where A is an i1 // or(A, sext(B)) -> B ? -1 : A where B is an i1 if (match(Op0, m_SExt(m_Value(A))) && A->getType()->isIntegerTy(1)) return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1); if (match(Op1, m_SExt(m_Value(A))) && A->getType()->isIntegerTy(1)) return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0); // Note: If we've gotten to the point of visiting the outer OR, then the // inner one couldn't be simplified. If it was a constant, then it won't // be simplified by a later pass either, so we try swapping the inner/outer // ORs in the hopes that we'll be able to simplify it this way. // (X|C) | V --> (X|V) | C if (Op0->hasOneUse() && !isa(Op1) && match(Op0, m_Or(m_Value(A), m_ConstantInt(C1)))) { Value *Inner = Builder->CreateOr(A, Op1); Inner->takeName(Op0); return BinaryOperator::CreateOr(Inner, C1); } // Change (or (bool?A:B),(bool?C:D)) --> (bool?(or A,C):(or B,D)) // Since this OR statement hasn't been optimized further yet, we hope // that this transformation will allow the new ORs to be optimized. { Value *X = nullptr, *Y = nullptr; if (Op0->hasOneUse() && Op1->hasOneUse() && match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) && match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) { Value *orTrue = Builder->CreateOr(A, C); Value *orFalse = Builder->CreateOr(B, D); return SelectInst::Create(X, orTrue, orFalse); } } return Changed ? &I : nullptr; } Instruction *InstCombiner::visitXor(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); if (Value *V = SimplifyXorInst(Op0, Op1, DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); // (A&B)^(A&C) -> A&(B^C) etc if (Value *V = SimplifyUsingDistributiveLaws(I)) return ReplaceInstUsesWith(I, V); // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. if (SimplifyDemandedInstructionBits(I)) return &I; if (Value *V = SimplifyBSwap(I)) return ReplaceInstUsesWith(I, V); // Is this a ~ operation? if (Value *NotOp = dyn_castNotVal(&I)) { if (BinaryOperator *Op0I = dyn_cast(NotOp)) { if (Op0I->getOpcode() == Instruction::And || Op0I->getOpcode() == Instruction::Or) { // ~(~X & Y) --> (X | ~Y) - De Morgan's Law // ~(~X | Y) === (X & ~Y) - De Morgan's Law if (dyn_castNotVal(Op0I->getOperand(1))) Op0I->swapOperands(); if (Value *Op0NotVal = dyn_castNotVal(Op0I->getOperand(0))) { Value *NotY = Builder->CreateNot(Op0I->getOperand(1), Op0I->getOperand(1)->getName()+".not"); if (Op0I->getOpcode() == Instruction::And) return BinaryOperator::CreateOr(Op0NotVal, NotY); return BinaryOperator::CreateAnd(Op0NotVal, NotY); } // ~(X & Y) --> (~X | ~Y) - De Morgan's Law // ~(X | Y) === (~X & ~Y) - De Morgan's Law if (IsFreeToInvert(Op0I->getOperand(0), Op0I->getOperand(0)->hasOneUse()) && IsFreeToInvert(Op0I->getOperand(1), Op0I->getOperand(1)->hasOneUse())) { Value *NotX = Builder->CreateNot(Op0I->getOperand(0), "notlhs"); Value *NotY = Builder->CreateNot(Op0I->getOperand(1), "notrhs"); if (Op0I->getOpcode() == Instruction::And) return BinaryOperator::CreateOr(NotX, NotY); return BinaryOperator::CreateAnd(NotX, NotY); } } else if (Op0I->getOpcode() == Instruction::AShr) { // ~(~X >>s Y) --> (X >>s Y) if (Value *Op0NotVal = dyn_castNotVal(Op0I->getOperand(0))) return BinaryOperator::CreateAShr(Op0NotVal, Op0I->getOperand(1)); } } } if (Constant *RHS = dyn_cast(Op1)) { if (RHS->isAllOnesValue() && Op0->hasOneUse()) // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B if (CmpInst *CI = dyn_cast(Op0)) return CmpInst::Create(CI->getOpcode(), CI->getInversePredicate(), CI->getOperand(0), CI->getOperand(1)); } if (ConstantInt *RHS = dyn_cast(Op1)) { // fold (xor(zext(cmp)), 1) and (xor(sext(cmp)), -1) to ext(!cmp). if (CastInst *Op0C = dyn_cast(Op0)) { if (CmpInst *CI = dyn_cast(Op0C->getOperand(0))) { if (CI->hasOneUse() && Op0C->hasOneUse()) { Instruction::CastOps Opcode = Op0C->getOpcode(); if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) && (RHS == ConstantExpr::getCast(Opcode, Builder->getTrue(), Op0C->getDestTy()))) { CI->setPredicate(CI->getInversePredicate()); return CastInst::Create(Opcode, CI, Op0C->getType()); } } } } if (BinaryOperator *Op0I = dyn_cast(Op0)) { // ~(c-X) == X-c-1 == X+(-c-1) if (Op0I->getOpcode() == Instruction::Sub && RHS->isAllOnesValue()) if (Constant *Op0I0C = dyn_cast(Op0I->getOperand(0))) { Constant *NegOp0I0C = ConstantExpr::getNeg(Op0I0C); Constant *ConstantRHS = ConstantExpr::getSub(NegOp0I0C, ConstantInt::get(I.getType(), 1)); return BinaryOperator::CreateAdd(Op0I->getOperand(1), ConstantRHS); } if (ConstantInt *Op0CI = dyn_cast(Op0I->getOperand(1))) { if (Op0I->getOpcode() == Instruction::Add) { // ~(X-c) --> (-c-1)-X if (RHS->isAllOnesValue()) { Constant *NegOp0CI = ConstantExpr::getNeg(Op0CI); return BinaryOperator::CreateSub( ConstantExpr::getSub(NegOp0CI, ConstantInt::get(I.getType(), 1)), Op0I->getOperand(0)); } else if (RHS->getValue().isSignBit()) { // (X + C) ^ signbit -> (X + C + signbit) Constant *C = Builder->getInt(RHS->getValue() + Op0CI->getValue()); return BinaryOperator::CreateAdd(Op0I->getOperand(0), C); } } else if (Op0I->getOpcode() == Instruction::Or) { // (X|C1)^C2 -> X^(C1|C2) iff X&~C1 == 0 if (MaskedValueIsZero(Op0I->getOperand(0), Op0CI->getValue(), 0, &I)) { Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHS); // Anything in both C1 and C2 is known to be zero, remove it from // NewRHS. Constant *CommonBits = ConstantExpr::getAnd(Op0CI, RHS); NewRHS = ConstantExpr::getAnd(NewRHS, ConstantExpr::getNot(CommonBits)); Worklist.Add(Op0I); I.setOperand(0, Op0I->getOperand(0)); I.setOperand(1, NewRHS); return &I; } } else if (Op0I->getOpcode() == Instruction::LShr) { // ((X^C1) >> C2) ^ C3 -> (X>>C2) ^ ((C1>>C2)^C3) // E1 = "X ^ C1" BinaryOperator *E1; ConstantInt *C1; if (Op0I->hasOneUse() && (E1 = dyn_cast(Op0I->getOperand(0))) && E1->getOpcode() == Instruction::Xor && (C1 = dyn_cast(E1->getOperand(1)))) { // fold (C1 >> C2) ^ C3 ConstantInt *C2 = Op0CI, *C3 = RHS; APInt FoldConst = C1->getValue().lshr(C2->getValue()); FoldConst ^= C3->getValue(); // Prepare the two operands. Value *Opnd0 = Builder->CreateLShr(E1->getOperand(0), C2); Opnd0->takeName(Op0I); cast(Opnd0)->setDebugLoc(I.getDebugLoc()); Value *FoldVal = ConstantInt::get(Opnd0->getType(), FoldConst); return BinaryOperator::CreateXor(Opnd0, FoldVal); } } } } // Try to fold constant and into select arguments. if (SelectInst *SI = dyn_cast(Op0)) if (Instruction *R = FoldOpIntoSelect(I, SI)) return R; if (isa(Op0)) if (Instruction *NV = FoldOpIntoPhi(I)) return NV; } BinaryOperator *Op1I = dyn_cast(Op1); if (Op1I) { Value *A, *B; if (match(Op1I, m_Or(m_Value(A), m_Value(B)))) { if (A == Op0) { // B^(B|A) == (A|B)^B Op1I->swapOperands(); I.swapOperands(); std::swap(Op0, Op1); } else if (B == Op0) { // B^(A|B) == (A|B)^B I.swapOperands(); // Simplified below. std::swap(Op0, Op1); } } else if (match(Op1I, m_And(m_Value(A), m_Value(B))) && Op1I->hasOneUse()){ if (A == Op0) { // A^(A&B) -> A^(B&A) Op1I->swapOperands(); std::swap(A, B); } if (B == Op0) { // A^(B&A) -> (B&A)^A I.swapOperands(); // Simplified below. std::swap(Op0, Op1); } } } BinaryOperator *Op0I = dyn_cast(Op0); if (Op0I) { Value *A, *B; if (match(Op0I, m_Or(m_Value(A), m_Value(B))) && Op0I->hasOneUse()) { if (A == Op1) // (B|A)^B == (A|B)^B std::swap(A, B); if (B == Op1) // (A|B)^B == A & ~B return BinaryOperator::CreateAnd(A, Builder->CreateNot(Op1)); } else if (match(Op0I, m_And(m_Value(A), m_Value(B))) && Op0I->hasOneUse()){ if (A == Op1) // (A&B)^A -> (B&A)^A std::swap(A, B); if (B == Op1 && // (B&A)^A == ~B & A !isa(Op1)) { // Canonical form is (B&C)^C return BinaryOperator::CreateAnd(Builder->CreateNot(A), Op1); } } } if (Op0I && Op1I) { Value *A, *B, *C, *D; // (A & B)^(A | B) -> A ^ B if (match(Op0I, m_And(m_Value(A), m_Value(B))) && match(Op1I, m_Or(m_Value(C), m_Value(D)))) { if ((A == C && B == D) || (A == D && B == C)) return BinaryOperator::CreateXor(A, B); } // (A | B)^(A & B) -> A ^ B if (match(Op0I, m_Or(m_Value(A), m_Value(B))) && match(Op1I, m_And(m_Value(C), m_Value(D)))) { if ((A == C && B == D) || (A == D && B == C)) return BinaryOperator::CreateXor(A, B); } // (A | ~B) ^ (~A | B) -> A ^ B if (match(Op0I, m_Or(m_Value(A), m_Not(m_Value(B)))) && match(Op1I, m_Or(m_Not(m_Specific(A)), m_Specific(B)))) { return BinaryOperator::CreateXor(A, B); } // (~A | B) ^ (A | ~B) -> A ^ B if (match(Op0I, m_Or(m_Not(m_Value(A)), m_Value(B))) && match(Op1I, m_Or(m_Specific(A), m_Not(m_Specific(B))))) { return BinaryOperator::CreateXor(A, B); } // (A & ~B) ^ (~A & B) -> A ^ B if (match(Op0I, m_And(m_Value(A), m_Not(m_Value(B)))) && match(Op1I, m_And(m_Not(m_Specific(A)), m_Specific(B)))) { return BinaryOperator::CreateXor(A, B); } // (~A & B) ^ (A & ~B) -> A ^ B if (match(Op0I, m_And(m_Not(m_Value(A)), m_Value(B))) && match(Op1I, m_And(m_Specific(A), m_Not(m_Specific(B))))) { return BinaryOperator::CreateXor(A, B); } // (A ^ C)^(A | B) -> ((~A) & B) ^ C if (match(Op0I, m_Xor(m_Value(D), m_Value(C))) && match(Op1I, m_Or(m_Value(A), m_Value(B)))) { if (D == A) return BinaryOperator::CreateXor( Builder->CreateAnd(Builder->CreateNot(A), B), C); if (D == B) return BinaryOperator::CreateXor( Builder->CreateAnd(Builder->CreateNot(B), A), C); } // (A | B)^(A ^ C) -> ((~A) & B) ^ C if (match(Op0I, m_Or(m_Value(A), m_Value(B))) && match(Op1I, m_Xor(m_Value(D), m_Value(C)))) { if (D == A) return BinaryOperator::CreateXor( Builder->CreateAnd(Builder->CreateNot(A), B), C); if (D == B) return BinaryOperator::CreateXor( Builder->CreateAnd(Builder->CreateNot(B), A), C); } // (A & B) ^ (A ^ B) -> (A | B) if (match(Op0I, m_And(m_Value(A), m_Value(B))) && match(Op1I, m_Xor(m_Specific(A), m_Specific(B)))) return BinaryOperator::CreateOr(A, B); // (A ^ B) ^ (A & B) -> (A | B) if (match(Op0I, m_Xor(m_Value(A), m_Value(B))) && match(Op1I, m_And(m_Specific(A), m_Specific(B)))) return BinaryOperator::CreateOr(A, B); } Value *A = nullptr, *B = nullptr; // (A & ~B) ^ (~A) -> ~(A & B) if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) && match(Op1, m_Not(m_Specific(A)))) return BinaryOperator::CreateNot(Builder->CreateAnd(A, B)); // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B) if (ICmpInst *RHS = dyn_cast(I.getOperand(1))) if (ICmpInst *LHS = dyn_cast(I.getOperand(0))) if (PredicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) { if (LHS->getOperand(0) == RHS->getOperand(1) && LHS->getOperand(1) == RHS->getOperand(0)) LHS->swapOperands(); if (LHS->getOperand(0) == RHS->getOperand(0) && LHS->getOperand(1) == RHS->getOperand(1)) { Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS); bool isSigned = LHS->isSigned() || RHS->isSigned(); return ReplaceInstUsesWith(I, getNewICmpValue(isSigned, Code, Op0, Op1, Builder)); } } // fold (xor (cast A), (cast B)) -> (cast (xor A, B)) if (CastInst *Op0C = dyn_cast(Op0)) { if (CastInst *Op1C = dyn_cast(Op1)) if (Op0C->getOpcode() == Op1C->getOpcode()) { // same cast kind? Type *SrcTy = Op0C->getOperand(0)->getType(); if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntegerTy() && // Only do this if the casts both really cause code to be generated. ShouldOptimizeCast(Op0C->getOpcode(), Op0C->getOperand(0), I.getType()) && ShouldOptimizeCast(Op1C->getOpcode(), Op1C->getOperand(0), I.getType())) { Value *NewOp = Builder->CreateXor(Op0C->getOperand(0), Op1C->getOperand(0), I.getName()); return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType()); } } } return Changed ? &I : nullptr; } Index: projects/clang380-import/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp =================================================================== --- projects/clang380-import/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp (revision 294609) @@ -1,1714 +1,2011 @@ //===- InlineFunction.cpp - Code to perform function inlining -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements inlining of a function into a call site, resolving // parameters and the return value as appropriate. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/CommandLine.h" #include using namespace llvm; static cl::opt EnableNoAliasConversion("enable-noalias-to-md-conversion", cl::init(true), cl::Hidden, cl::desc("Convert noalias attributes to metadata during inlining.")); static cl::opt PreserveAlignmentAssumptions("preserve-alignment-assumptions-during-inlining", cl::init(true), cl::Hidden, cl::desc("Convert align attributes to assumptions during inlining.")); bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI, AAResults *CalleeAAR, bool InsertLifetime) { return InlineFunction(CallSite(CI), IFI, CalleeAAR, InsertLifetime); } bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI, AAResults *CalleeAAR, bool InsertLifetime) { return InlineFunction(CallSite(II), IFI, CalleeAAR, InsertLifetime); } namespace { /// A class for recording information about inlining a landing pad. class LandingPadInliningInfo { BasicBlock *OuterResumeDest; ///< Destination of the invoke's unwind. BasicBlock *InnerResumeDest; ///< Destination for the callee's resume. LandingPadInst *CallerLPad; ///< LandingPadInst associated with the invoke. PHINode *InnerEHValuesPHI; ///< PHI for EH values from landingpad insts. SmallVector UnwindDestPHIValues; public: LandingPadInliningInfo(InvokeInst *II) : OuterResumeDest(II->getUnwindDest()), InnerResumeDest(nullptr), CallerLPad(nullptr), InnerEHValuesPHI(nullptr) { // If there are PHI nodes in the unwind destination block, we need to keep // track of which values came into them from the invoke before removing // the edge from this block. llvm::BasicBlock *InvokeBB = II->getParent(); BasicBlock::iterator I = OuterResumeDest->begin(); for (; isa(I); ++I) { // Save the value to use for this edge. PHINode *PHI = cast(I); UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB)); } CallerLPad = cast(I); } /// The outer unwind destination is the target of /// unwind edges introduced for calls within the inlined function. BasicBlock *getOuterResumeDest() const { return OuterResumeDest; } BasicBlock *getInnerResumeDest(); LandingPadInst *getLandingPadInst() const { return CallerLPad; } /// Forward the 'resume' instruction to the caller's landing pad block. /// When the landing pad block has only one predecessor, this is /// a simple branch. When there is more than one predecessor, we need to /// split the landing pad block after the landingpad instruction and jump /// to there. void forwardResume(ResumeInst *RI, SmallPtrSetImpl &InlinedLPads); /// Add incoming-PHI values to the unwind destination block for the given /// basic block, using the values for the original invoke's source block. void addIncomingPHIValuesFor(BasicBlock *BB) const { addIncomingPHIValuesForInto(BB, OuterResumeDest); } void addIncomingPHIValuesForInto(BasicBlock *src, BasicBlock *dest) const { BasicBlock::iterator I = dest->begin(); for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) { PHINode *phi = cast(I); phi->addIncoming(UnwindDestPHIValues[i], src); } } }; } // anonymous namespace /// Get or create a target for the branch from ResumeInsts. BasicBlock *LandingPadInliningInfo::getInnerResumeDest() { if (InnerResumeDest) return InnerResumeDest; // Split the landing pad. BasicBlock::iterator SplitPoint = ++CallerLPad->getIterator(); InnerResumeDest = OuterResumeDest->splitBasicBlock(SplitPoint, OuterResumeDest->getName() + ".body"); // The number of incoming edges we expect to the inner landing pad. const unsigned PHICapacity = 2; // Create corresponding new PHIs for all the PHIs in the outer landing pad. Instruction *InsertPoint = &InnerResumeDest->front(); BasicBlock::iterator I = OuterResumeDest->begin(); for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) { PHINode *OuterPHI = cast(I); PHINode *InnerPHI = PHINode::Create(OuterPHI->getType(), PHICapacity, OuterPHI->getName() + ".lpad-body", InsertPoint); OuterPHI->replaceAllUsesWith(InnerPHI); InnerPHI->addIncoming(OuterPHI, OuterResumeDest); } // Create a PHI for the exception values. InnerEHValuesPHI = PHINode::Create(CallerLPad->getType(), PHICapacity, "eh.lpad-body", InsertPoint); CallerLPad->replaceAllUsesWith(InnerEHValuesPHI); InnerEHValuesPHI->addIncoming(CallerLPad, OuterResumeDest); // All done. return InnerResumeDest; } /// Forward the 'resume' instruction to the caller's landing pad block. /// When the landing pad block has only one predecessor, this is a simple /// branch. When there is more than one predecessor, we need to split the /// landing pad block after the landingpad instruction and jump to there. void LandingPadInliningInfo::forwardResume( ResumeInst *RI, SmallPtrSetImpl &InlinedLPads) { BasicBlock *Dest = getInnerResumeDest(); BasicBlock *Src = RI->getParent(); BranchInst::Create(Dest, Src); // Update the PHIs in the destination. They were inserted in an order which // makes this work. addIncomingPHIValuesForInto(Src, Dest); InnerEHValuesPHI->addIncoming(RI->getOperand(0), Src); RI->eraseFromParent(); } +/// Helper for getUnwindDestToken/getUnwindDestTokenHelper. +static Value *getParentPad(Value *EHPad) { + if (auto *FPI = dyn_cast(EHPad)) + return FPI->getParentPad(); + return cast(EHPad)->getParentPad(); +} + +typedef DenseMap UnwindDestMemoTy; + +/// Helper for getUnwindDestToken that does the descendant-ward part of +/// the search. +static Value *getUnwindDestTokenHelper(Instruction *EHPad, + UnwindDestMemoTy &MemoMap) { + SmallVector Worklist(1, EHPad); + + while (!Worklist.empty()) { + Instruction *CurrentPad = Worklist.pop_back_val(); + // We only put pads on the worklist that aren't in the MemoMap. When + // we find an unwind dest for a pad we may update its ancestors, but + // the queue only ever contains uncles/great-uncles/etc. of CurrentPad, + // so they should never get updated while queued on the worklist. + assert(!MemoMap.count(CurrentPad)); + Value *UnwindDestToken = nullptr; + if (auto *CatchSwitch = dyn_cast(CurrentPad)) { + if (CatchSwitch->hasUnwindDest()) { + UnwindDestToken = CatchSwitch->getUnwindDest()->getFirstNonPHI(); + } else { + // Catchswitch doesn't have a 'nounwind' variant, and one might be + // annotated as "unwinds to caller" when really it's nounwind (see + // e.g. SimplifyCFGOpt::SimplifyUnreachable), so we can't infer the + // parent's unwind dest from this. We can check its catchpads' + // descendants, since they might include a cleanuppad with an + // "unwinds to caller" cleanupret, which can be trusted. + for (auto HI = CatchSwitch->handler_begin(), + HE = CatchSwitch->handler_end(); + HI != HE && !UnwindDestToken; ++HI) { + BasicBlock *HandlerBlock = *HI; + auto *CatchPad = cast(HandlerBlock->getFirstNonPHI()); + for (User *Child : CatchPad->users()) { + // Intentionally ignore invokes here -- since the catchswitch is + // marked "unwind to caller", it would be a verifier error if it + // contained an invoke which unwinds out of it, so any invoke we'd + // encounter must unwind to some child of the catch. + if (!isa(Child) && !isa(Child)) + continue; + + Instruction *ChildPad = cast(Child); + auto Memo = MemoMap.find(ChildPad); + if (Memo == MemoMap.end()) { + // Haven't figure out this child pad yet; queue it. + Worklist.push_back(ChildPad); + continue; + } + // We've already checked this child, but might have found that + // it offers no proof either way. + Value *ChildUnwindDestToken = Memo->second; + if (!ChildUnwindDestToken) + continue; + // We already know the child's unwind dest, which can either + // be ConstantTokenNone to indicate unwind to caller, or can + // be another child of the catchpad. Only the former indicates + // the unwind dest of the catchswitch. + if (isa(ChildUnwindDestToken)) { + UnwindDestToken = ChildUnwindDestToken; + break; + } + assert(getParentPad(ChildUnwindDestToken) == CatchPad); + } + } + } + } else { + auto *CleanupPad = cast(CurrentPad); + for (User *U : CleanupPad->users()) { + if (auto *CleanupRet = dyn_cast(U)) { + if (BasicBlock *RetUnwindDest = CleanupRet->getUnwindDest()) + UnwindDestToken = RetUnwindDest->getFirstNonPHI(); + else + UnwindDestToken = ConstantTokenNone::get(CleanupPad->getContext()); + break; + } + Value *ChildUnwindDestToken; + if (auto *Invoke = dyn_cast(U)) { + ChildUnwindDestToken = Invoke->getUnwindDest()->getFirstNonPHI(); + } else if (isa(U) || isa(U)) { + Instruction *ChildPad = cast(U); + auto Memo = MemoMap.find(ChildPad); + if (Memo == MemoMap.end()) { + // Haven't resolved this child yet; queue it and keep searching. + Worklist.push_back(ChildPad); + continue; + } + // We've checked this child, but still need to ignore it if it + // had no proof either way. + ChildUnwindDestToken = Memo->second; + if (!ChildUnwindDestToken) + continue; + } else { + // Not a relevant user of the cleanuppad + continue; + } + // In a well-formed program, the child/invoke must either unwind to + // an(other) child of the cleanup, or exit the cleanup. In the + // first case, continue searching. + if (isa(ChildUnwindDestToken) && + getParentPad(ChildUnwindDestToken) == CleanupPad) + continue; + UnwindDestToken = ChildUnwindDestToken; + break; + } + } + // If we haven't found an unwind dest for CurrentPad, we may have queued its + // children, so move on to the next in the worklist. + if (!UnwindDestToken) + continue; + + // Now we know that CurrentPad unwinds to UnwindDestToken. It also exits + // any ancestors of CurrentPad up to but not including UnwindDestToken's + // parent pad. Record this in the memo map, and check to see if the + // original EHPad being queried is one of the ones exited. + Value *UnwindParent; + if (auto *UnwindPad = dyn_cast(UnwindDestToken)) + UnwindParent = getParentPad(UnwindPad); + else + UnwindParent = nullptr; + bool ExitedOriginalPad = false; + for (Instruction *ExitedPad = CurrentPad; + ExitedPad && ExitedPad != UnwindParent; + ExitedPad = dyn_cast(getParentPad(ExitedPad))) { + // Skip over catchpads since they just follow their catchswitches. + if (isa(ExitedPad)) + continue; + MemoMap[ExitedPad] = UnwindDestToken; + ExitedOriginalPad |= (ExitedPad == EHPad); + } + + if (ExitedOriginalPad) + return UnwindDestToken; + + // Continue the search. + } + + // No definitive information is contained within this funclet. + return nullptr; +} + +/// Given an EH pad, find where it unwinds. If it unwinds to an EH pad, +/// return that pad instruction. If it unwinds to caller, return +/// ConstantTokenNone. If it does not have a definitive unwind destination, +/// return nullptr. +/// +/// This routine gets invoked for calls in funclets in inlinees when inlining +/// an invoke. Since many funclets don't have calls inside them, it's queried +/// on-demand rather than building a map of pads to unwind dests up front. +/// Determining a funclet's unwind dest may require recursively searching its +/// descendants, and also ancestors and cousins if the descendants don't provide +/// an answer. Since most funclets will have their unwind dest immediately +/// available as the unwind dest of a catchswitch or cleanupret, this routine +/// searches top-down from the given pad and then up. To avoid worst-case +/// quadratic run-time given that approach, it uses a memo map to avoid +/// re-processing funclet trees. The callers that rewrite the IR as they go +/// take advantage of this, for correctness, by checking/forcing rewritten +/// pads' entries to match the original callee view. +static Value *getUnwindDestToken(Instruction *EHPad, + UnwindDestMemoTy &MemoMap) { + // Catchpads unwind to the same place as their catchswitch; + // redirct any queries on catchpads so the code below can + // deal with just catchswitches and cleanuppads. + if (auto *CPI = dyn_cast(EHPad)) + EHPad = CPI->getCatchSwitch(); + + // Check if we've already determined the unwind dest for this pad. + auto Memo = MemoMap.find(EHPad); + if (Memo != MemoMap.end()) + return Memo->second; + + // Search EHPad and, if necessary, its descendants. + Value *UnwindDestToken = getUnwindDestTokenHelper(EHPad, MemoMap); + assert((UnwindDestToken == nullptr) != (MemoMap.count(EHPad) != 0)); + if (UnwindDestToken) + return UnwindDestToken; + + // No information is available for this EHPad from itself or any of its + // descendants. An unwind all the way out to a pad in the caller would + // need also to agree with the unwind dest of the parent funclet, so + // search up the chain to try to find a funclet with information. Put + // null entries in the memo map to avoid re-processing as we go up. + MemoMap[EHPad] = nullptr; + Instruction *LastUselessPad = EHPad; + Value *AncestorToken; + for (AncestorToken = getParentPad(EHPad); + auto *AncestorPad = dyn_cast(AncestorToken); + AncestorToken = getParentPad(AncestorToken)) { + // Skip over catchpads since they just follow their catchswitches. + if (isa(AncestorPad)) + continue; + assert(!MemoMap.count(AncestorPad) || MemoMap[AncestorPad]); + auto AncestorMemo = MemoMap.find(AncestorPad); + if (AncestorMemo == MemoMap.end()) { + UnwindDestToken = getUnwindDestTokenHelper(AncestorPad, MemoMap); + } else { + UnwindDestToken = AncestorMemo->second; + } + if (UnwindDestToken) + break; + LastUselessPad = AncestorPad; + } + + // Since the whole tree under LastUselessPad has no information, it all must + // match UnwindDestToken; record that to avoid repeating the search. + SmallVector Worklist(1, LastUselessPad); + while (!Worklist.empty()) { + Instruction *UselessPad = Worklist.pop_back_val(); + assert(!MemoMap.count(UselessPad) || MemoMap[UselessPad] == nullptr); + MemoMap[UselessPad] = UnwindDestToken; + if (auto *CatchSwitch = dyn_cast(UselessPad)) { + for (BasicBlock *HandlerBlock : CatchSwitch->handlers()) + for (User *U : HandlerBlock->getFirstNonPHI()->users()) + if (isa(U) || isa(U)) + Worklist.push_back(cast(U)); + } else { + assert(isa(UselessPad)); + for (User *U : UselessPad->users()) + if (isa(U) || isa(U)) + Worklist.push_back(cast(U)); + } + } + + return UnwindDestToken; +} + /// When we inline a basic block into an invoke, /// we have to turn all of the calls that can throw into invokes. /// This function analyze BB to see if there are any calls, and if so, /// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI /// nodes in that block with the values specified in InvokeDestPHIValues. -static BasicBlock * -HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, BasicBlock *UnwindEdge) { +static BasicBlock *HandleCallsInBlockInlinedThroughInvoke( + BasicBlock *BB, BasicBlock *UnwindEdge, + UnwindDestMemoTy *FuncletUnwindMap = nullptr) { for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { Instruction *I = &*BBI++; // We only need to check for function calls: inlined invoke // instructions require no special handling. CallInst *CI = dyn_cast(I); if (!CI || CI->doesNotThrow() || isa(CI->getCalledValue())) continue; + if (auto FuncletBundle = CI->getOperandBundle(LLVMContext::OB_funclet)) { + // This call is nested inside a funclet. If that funclet has an unwind + // destination within the inlinee, then unwinding out of this call would + // be UB. Rewriting this call to an invoke which targets the inlined + // invoke's unwind dest would give the call's parent funclet multiple + // unwind destinations, which is something that subsequent EH table + // generation can't handle and that the veirifer rejects. So when we + // see such a call, leave it as a call. + auto *FuncletPad = cast(FuncletBundle->Inputs[0]); + Value *UnwindDestToken = + getUnwindDestToken(FuncletPad, *FuncletUnwindMap); + if (UnwindDestToken && !isa(UnwindDestToken)) + continue; +#ifndef NDEBUG + Instruction *MemoKey; + if (auto *CatchPad = dyn_cast(FuncletPad)) + MemoKey = CatchPad->getCatchSwitch(); + else + MemoKey = FuncletPad; + assert(FuncletUnwindMap->count(MemoKey) && + (*FuncletUnwindMap)[MemoKey] == UnwindDestToken && + "must get memoized to avoid confusing later searches"); +#endif // NDEBUG + } + // Convert this function call into an invoke instruction. First, split the // basic block. BasicBlock *Split = BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc"); // Delete the unconditional branch inserted by splitBasicBlock BB->getInstList().pop_back(); // Create the new invoke instruction. SmallVector InvokeArgs(CI->arg_begin(), CI->arg_end()); SmallVector OpBundles; CI->getOperandBundlesAsDefs(OpBundles); // Note: we're round tripping operand bundles through memory here, and that // can potentially be avoided with a cleverer API design that we do not have // as of this time. InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split, UnwindEdge, InvokeArgs, OpBundles, CI->getName(), BB); II->setDebugLoc(CI->getDebugLoc()); II->setCallingConv(CI->getCallingConv()); II->setAttributes(CI->getAttributes()); // Make sure that anything using the call now uses the invoke! This also // updates the CallGraph if present, because it uses a WeakVH. CI->replaceAllUsesWith(II); // Delete the original call Split->getInstList().pop_front(); return BB; } return nullptr; } /// If we inlined an invoke site, we need to convert calls /// in the body of the inlined function into invokes. /// /// II is the invoke instruction being inlined. FirstNewBlock is the first /// block of the inlined code (the last block is the end of the function), /// and InlineCodeInfo is information about the code that got inlined. static void HandleInlinedLandingPad(InvokeInst *II, BasicBlock *FirstNewBlock, ClonedCodeInfo &InlinedCodeInfo) { BasicBlock *InvokeDest = II->getUnwindDest(); Function *Caller = FirstNewBlock->getParent(); // The inlined code is currently at the end of the function, scan from the // start of the inlined code to its end, checking for stuff we need to // rewrite. LandingPadInliningInfo Invoke(II); // Get all of the inlined landing pad instructions. SmallPtrSet InlinedLPads; for (Function::iterator I = FirstNewBlock->getIterator(), E = Caller->end(); I != E; ++I) if (InvokeInst *II = dyn_cast(I->getTerminator())) InlinedLPads.insert(II->getLandingPadInst()); // Append the clauses from the outer landing pad instruction into the inlined // landing pad instructions. LandingPadInst *OuterLPad = Invoke.getLandingPadInst(); for (LandingPadInst *InlinedLPad : InlinedLPads) { unsigned OuterNum = OuterLPad->getNumClauses(); InlinedLPad->reserveClauses(OuterNum); for (unsigned OuterIdx = 0; OuterIdx != OuterNum; ++OuterIdx) InlinedLPad->addClause(OuterLPad->getClause(OuterIdx)); if (OuterLPad->isCleanup()) InlinedLPad->setCleanup(true); } for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); BB != E; ++BB) { if (InlinedCodeInfo.ContainsCalls) if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke( &*BB, Invoke.getOuterResumeDest())) // Update any PHI nodes in the exceptional block to indicate that there // is now a new entry in them. Invoke.addIncomingPHIValuesFor(NewBB); // Forward any resumes that are remaining here. if (ResumeInst *RI = dyn_cast(BB->getTerminator())) Invoke.forwardResume(RI, InlinedLPads); } // Now that everything is happy, we have one final detail. The PHI nodes in // the exception destination block still have entries due to the original // invoke instruction. Eliminate these entries (which might even delete the // PHI node) now. InvokeDest->removePredecessor(II->getParent()); } /// If we inlined an invoke site, we need to convert calls /// in the body of the inlined function into invokes. /// /// II is the invoke instruction being inlined. FirstNewBlock is the first /// block of the inlined code (the last block is the end of the function), /// and InlineCodeInfo is information about the code that got inlined. static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock, ClonedCodeInfo &InlinedCodeInfo) { BasicBlock *UnwindDest = II->getUnwindDest(); Function *Caller = FirstNewBlock->getParent(); assert(UnwindDest->getFirstNonPHI()->isEHPad() && "unexpected BasicBlock!"); // If there are PHI nodes in the unwind destination block, we need to keep // track of which values came into them from the invoke before removing the // edge from this block. SmallVector UnwindDestPHIValues; llvm::BasicBlock *InvokeBB = II->getParent(); for (Instruction &I : *UnwindDest) { // Save the value to use for this edge. PHINode *PHI = dyn_cast(&I); if (!PHI) break; UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB)); } // Add incoming-PHI values to the unwind destination block for the given basic // block, using the values for the original invoke's source block. auto UpdatePHINodes = [&](BasicBlock *Src) { BasicBlock::iterator I = UnwindDest->begin(); for (Value *V : UnwindDestPHIValues) { PHINode *PHI = cast(I); PHI->addIncoming(V, Src); ++I; } }; // This connects all the instructions which 'unwind to caller' to the invoke // destination. + UnwindDestMemoTy FuncletUnwindMap; for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); BB != E; ++BB) { if (auto *CRI = dyn_cast(BB->getTerminator())) { if (CRI->unwindsToCaller()) { - CleanupReturnInst::Create(CRI->getCleanupPad(), UnwindDest, CRI); + auto *CleanupPad = CRI->getCleanupPad(); + CleanupReturnInst::Create(CleanupPad, UnwindDest, CRI); CRI->eraseFromParent(); UpdatePHINodes(&*BB); + // Finding a cleanupret with an unwind destination would confuse + // subsequent calls to getUnwindDestToken, so map the cleanuppad + // to short-circuit any such calls and recognize this as an "unwind + // to caller" cleanup. + assert(!FuncletUnwindMap.count(CleanupPad) || + isa(FuncletUnwindMap[CleanupPad])); + FuncletUnwindMap[CleanupPad] = + ConstantTokenNone::get(Caller->getContext()); } } Instruction *I = BB->getFirstNonPHI(); if (!I->isEHPad()) continue; Instruction *Replacement = nullptr; if (auto *CatchSwitch = dyn_cast(I)) { if (CatchSwitch->unwindsToCaller()) { + Value *UnwindDestToken; + if (auto *ParentPad = + dyn_cast(CatchSwitch->getParentPad())) { + // This catchswitch is nested inside another funclet. If that + // funclet has an unwind destination within the inlinee, then + // unwinding out of this catchswitch would be UB. Rewriting this + // catchswitch to unwind to the inlined invoke's unwind dest would + // give the parent funclet multiple unwind destinations, which is + // something that subsequent EH table generation can't handle and + // that the veirifer rejects. So when we see such a call, leave it + // as "unwind to caller". + UnwindDestToken = getUnwindDestToken(ParentPad, FuncletUnwindMap); + if (UnwindDestToken && !isa(UnwindDestToken)) + continue; + } else { + // This catchswitch has no parent to inherit constraints from, and + // none of its descendants can have an unwind edge that exits it and + // targets another funclet in the inlinee. It may or may not have a + // descendant that definitively has an unwind to caller. In either + // case, we'll have to assume that any unwinds out of it may need to + // be routed to the caller, so treat it as though it has a definitive + // unwind to caller. + UnwindDestToken = ConstantTokenNone::get(Caller->getContext()); + } auto *NewCatchSwitch = CatchSwitchInst::Create( CatchSwitch->getParentPad(), UnwindDest, CatchSwitch->getNumHandlers(), CatchSwitch->getName(), CatchSwitch); for (BasicBlock *PadBB : CatchSwitch->handlers()) NewCatchSwitch->addHandler(PadBB); + // Propagate info for the old catchswitch over to the new one in + // the unwind map. This also serves to short-circuit any subsequent + // checks for the unwind dest of this catchswitch, which would get + // confused if they found the outer handler in the callee. + FuncletUnwindMap[NewCatchSwitch] = UnwindDestToken; Replacement = NewCatchSwitch; } } else if (!isa(I)) { llvm_unreachable("unexpected EHPad!"); } if (Replacement) { Replacement->takeName(I); I->replaceAllUsesWith(Replacement); I->eraseFromParent(); UpdatePHINodes(&*BB); } } if (InlinedCodeInfo.ContainsCalls) for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); BB != E; ++BB) - if (BasicBlock *NewBB = - HandleCallsInBlockInlinedThroughInvoke(&*BB, UnwindDest)) + if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke( + &*BB, UnwindDest, &FuncletUnwindMap)) // Update any PHI nodes in the exceptional block to indicate that there // is now a new entry in them. UpdatePHINodes(NewBB); // Now that everything is happy, we have one final detail. The PHI nodes in // the exception destination block still have entries due to the original // invoke instruction. Eliminate these entries (which might even delete the // PHI node) now. UnwindDest->removePredecessor(InvokeBB); } /// When inlining a function that contains noalias scope metadata, /// this metadata needs to be cloned so that the inlined blocks /// have different "unqiue scopes" at every call site. Were this not done, then /// aliasing scopes from a function inlined into a caller multiple times could /// not be differentiated (and this would lead to miscompiles because the /// non-aliasing property communicated by the metadata could have /// call-site-specific control dependencies). static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { const Function *CalledFunc = CS.getCalledFunction(); SetVector MD; // Note: We could only clone the metadata if it is already used in the // caller. I'm omitting that check here because it might confuse // inter-procedural alias analysis passes. We can revisit this if it becomes // an efficiency or overhead problem. for (Function::const_iterator I = CalledFunc->begin(), IE = CalledFunc->end(); I != IE; ++I) for (BasicBlock::const_iterator J = I->begin(), JE = I->end(); J != JE; ++J) { if (const MDNode *M = J->getMetadata(LLVMContext::MD_alias_scope)) MD.insert(M); if (const MDNode *M = J->getMetadata(LLVMContext::MD_noalias)) MD.insert(M); } if (MD.empty()) return; // Walk the existing metadata, adding the complete (perhaps cyclic) chain to // the set. SmallVector Queue(MD.begin(), MD.end()); while (!Queue.empty()) { const MDNode *M = cast(Queue.pop_back_val()); for (unsigned i = 0, ie = M->getNumOperands(); i != ie; ++i) if (const MDNode *M1 = dyn_cast(M->getOperand(i))) if (MD.insert(M1)) Queue.push_back(M1); } // Now we have a complete set of all metadata in the chains used to specify // the noalias scopes and the lists of those scopes. SmallVector DummyNodes; DenseMap MDMap; for (SetVector::iterator I = MD.begin(), IE = MD.end(); I != IE; ++I) { DummyNodes.push_back(MDTuple::getTemporary(CalledFunc->getContext(), None)); MDMap[*I].reset(DummyNodes.back().get()); } // Create new metadata nodes to replace the dummy nodes, replacing old // metadata references with either a dummy node or an already-created new // node. for (SetVector::iterator I = MD.begin(), IE = MD.end(); I != IE; ++I) { SmallVector NewOps; for (unsigned i = 0, ie = (*I)->getNumOperands(); i != ie; ++i) { const Metadata *V = (*I)->getOperand(i); if (const MDNode *M = dyn_cast(V)) NewOps.push_back(MDMap[M]); else NewOps.push_back(const_cast(V)); } MDNode *NewM = MDNode::get(CalledFunc->getContext(), NewOps); MDTuple *TempM = cast(MDMap[*I]); assert(TempM->isTemporary() && "Expected temporary node"); TempM->replaceAllUsesWith(NewM); } // Now replace the metadata in the new inlined instructions with the // repacements from the map. for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end(); VMI != VMIE; ++VMI) { if (!VMI->second) continue; Instruction *NI = dyn_cast(VMI->second); if (!NI) continue; if (MDNode *M = NI->getMetadata(LLVMContext::MD_alias_scope)) { MDNode *NewMD = MDMap[M]; // If the call site also had alias scope metadata (a list of scopes to // which instructions inside it might belong), propagate those scopes to // the inlined instructions. if (MDNode *CSM = CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope)) NewMD = MDNode::concatenate(NewMD, CSM); NI->setMetadata(LLVMContext::MD_alias_scope, NewMD); } else if (NI->mayReadOrWriteMemory()) { if (MDNode *M = CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope)) NI->setMetadata(LLVMContext::MD_alias_scope, M); } if (MDNode *M = NI->getMetadata(LLVMContext::MD_noalias)) { MDNode *NewMD = MDMap[M]; // If the call site also had noalias metadata (a list of scopes with // which instructions inside it don't alias), propagate those scopes to // the inlined instructions. if (MDNode *CSM = CS.getInstruction()->getMetadata(LLVMContext::MD_noalias)) NewMD = MDNode::concatenate(NewMD, CSM); NI->setMetadata(LLVMContext::MD_noalias, NewMD); } else if (NI->mayReadOrWriteMemory()) { if (MDNode *M = CS.getInstruction()->getMetadata(LLVMContext::MD_noalias)) NI->setMetadata(LLVMContext::MD_noalias, M); } } } /// If the inlined function has noalias arguments, /// then add new alias scopes for each noalias argument, tag the mapped noalias /// parameters with noalias metadata specifying the new scope, and tag all /// non-derived loads, stores and memory intrinsics with the new alias scopes. static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, const DataLayout &DL, AAResults *CalleeAAR) { if (!EnableNoAliasConversion) return; const Function *CalledFunc = CS.getCalledFunction(); SmallVector NoAliasArgs; for (const Argument &I : CalledFunc->args()) { if (I.hasNoAliasAttr() && !I.hasNUses(0)) NoAliasArgs.push_back(&I); } if (NoAliasArgs.empty()) return; // To do a good job, if a noalias variable is captured, we need to know if // the capture point dominates the particular use we're considering. DominatorTree DT; DT.recalculate(const_cast(*CalledFunc)); // noalias indicates that pointer values based on the argument do not alias // pointer values which are not based on it. So we add a new "scope" for each // noalias function argument. Accesses using pointers based on that argument // become part of that alias scope, accesses using pointers not based on that // argument are tagged as noalias with that scope. DenseMap NewScopes; MDBuilder MDB(CalledFunc->getContext()); // Create a new scope domain for this function. MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain(CalledFunc->getName()); for (unsigned i = 0, e = NoAliasArgs.size(); i != e; ++i) { const Argument *A = NoAliasArgs[i]; std::string Name = CalledFunc->getName(); if (A->hasName()) { Name += ": %"; Name += A->getName(); } else { Name += ": argument "; Name += utostr(i); } // Note: We always create a new anonymous root here. This is true regardless // of the linkage of the callee because the aliasing "scope" is not just a // property of the callee, but also all control dependencies in the caller. MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name); NewScopes.insert(std::make_pair(A, NewScope)); } // Iterate over all new instructions in the map; for all memory-access // instructions, add the alias scope metadata. for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end(); VMI != VMIE; ++VMI) { if (const Instruction *I = dyn_cast(VMI->first)) { if (!VMI->second) continue; Instruction *NI = dyn_cast(VMI->second); if (!NI) continue; bool IsArgMemOnlyCall = false, IsFuncCall = false; SmallVector PtrArgs; if (const LoadInst *LI = dyn_cast(I)) PtrArgs.push_back(LI->getPointerOperand()); else if (const StoreInst *SI = dyn_cast(I)) PtrArgs.push_back(SI->getPointerOperand()); else if (const VAArgInst *VAAI = dyn_cast(I)) PtrArgs.push_back(VAAI->getPointerOperand()); else if (const AtomicCmpXchgInst *CXI = dyn_cast(I)) PtrArgs.push_back(CXI->getPointerOperand()); else if (const AtomicRMWInst *RMWI = dyn_cast(I)) PtrArgs.push_back(RMWI->getPointerOperand()); else if (ImmutableCallSite ICS = ImmutableCallSite(I)) { // If we know that the call does not access memory, then we'll still // know that about the inlined clone of this call site, and we don't // need to add metadata. if (ICS.doesNotAccessMemory()) continue; IsFuncCall = true; if (CalleeAAR) { FunctionModRefBehavior MRB = CalleeAAR->getModRefBehavior(ICS); if (MRB == FMRB_OnlyAccessesArgumentPointees || MRB == FMRB_OnlyReadsArgumentPointees) IsArgMemOnlyCall = true; } for (ImmutableCallSite::arg_iterator AI = ICS.arg_begin(), AE = ICS.arg_end(); AI != AE; ++AI) { // We need to check the underlying objects of all arguments, not just // the pointer arguments, because we might be passing pointers as // integers, etc. // However, if we know that the call only accesses pointer arguments, // then we only need to check the pointer arguments. if (IsArgMemOnlyCall && !(*AI)->getType()->isPointerTy()) continue; PtrArgs.push_back(*AI); } } // If we found no pointers, then this instruction is not suitable for // pairing with an instruction to receive aliasing metadata. // However, if this is a call, this we might just alias with none of the // noalias arguments. if (PtrArgs.empty() && !IsFuncCall) continue; // It is possible that there is only one underlying object, but you // need to go through several PHIs to see it, and thus could be // repeated in the Objects list. SmallPtrSet ObjSet; SmallVector Scopes, NoAliases; SmallSetVector NAPtrArgs; for (unsigned i = 0, ie = PtrArgs.size(); i != ie; ++i) { SmallVector Objects; GetUnderlyingObjects(const_cast(PtrArgs[i]), Objects, DL, /* LI = */ nullptr); for (Value *O : Objects) ObjSet.insert(O); } // Figure out if we're derived from anything that is not a noalias // argument. bool CanDeriveViaCapture = false, UsesAliasingPtr = false; for (const Value *V : ObjSet) { // Is this value a constant that cannot be derived from any pointer // value (we need to exclude constant expressions, for example, that // are formed from arithmetic on global symbols). bool IsNonPtrConst = isa(V) || isa(V) || isa(V) || isa(V) || isa(V); if (IsNonPtrConst) continue; // If this is anything other than a noalias argument, then we cannot // completely describe the aliasing properties using alias.scope // metadata (and, thus, won't add any). if (const Argument *A = dyn_cast(V)) { if (!A->hasNoAliasAttr()) UsesAliasingPtr = true; } else { UsesAliasingPtr = true; } // If this is not some identified function-local object (which cannot // directly alias a noalias argument), or some other argument (which, // by definition, also cannot alias a noalias argument), then we could // alias a noalias argument that has been captured). if (!isa(V) && !isIdentifiedFunctionLocal(const_cast(V))) CanDeriveViaCapture = true; } // A function call can always get captured noalias pointers (via other // parameters, globals, etc.). if (IsFuncCall && !IsArgMemOnlyCall) CanDeriveViaCapture = true; // First, we want to figure out all of the sets with which we definitely // don't alias. Iterate over all noalias set, and add those for which: // 1. The noalias argument is not in the set of objects from which we // definitely derive. // 2. The noalias argument has not yet been captured. // An arbitrary function that might load pointers could see captured // noalias arguments via other noalias arguments or globals, and so we // must always check for prior capture. for (const Argument *A : NoAliasArgs) { if (!ObjSet.count(A) && (!CanDeriveViaCapture || // It might be tempting to skip the // PointerMayBeCapturedBefore check if // A->hasNoCaptureAttr() is true, but this is // incorrect because nocapture only guarantees // that no copies outlive the function, not // that the value cannot be locally captured. !PointerMayBeCapturedBefore(A, /* ReturnCaptures */ false, /* StoreCaptures */ false, I, &DT))) NoAliases.push_back(NewScopes[A]); } if (!NoAliases.empty()) NI->setMetadata(LLVMContext::MD_noalias, MDNode::concatenate( NI->getMetadata(LLVMContext::MD_noalias), MDNode::get(CalledFunc->getContext(), NoAliases))); // Next, we want to figure out all of the sets to which we might belong. // We might belong to a set if the noalias argument is in the set of // underlying objects. If there is some non-noalias argument in our list // of underlying objects, then we cannot add a scope because the fact // that some access does not alias with any set of our noalias arguments // cannot itself guarantee that it does not alias with this access // (because there is some pointer of unknown origin involved and the // other access might also depend on this pointer). We also cannot add // scopes to arbitrary functions unless we know they don't access any // non-parameter pointer-values. bool CanAddScopes = !UsesAliasingPtr; if (CanAddScopes && IsFuncCall) CanAddScopes = IsArgMemOnlyCall; if (CanAddScopes) for (const Argument *A : NoAliasArgs) { if (ObjSet.count(A)) Scopes.push_back(NewScopes[A]); } if (!Scopes.empty()) NI->setMetadata( LLVMContext::MD_alias_scope, MDNode::concatenate(NI->getMetadata(LLVMContext::MD_alias_scope), MDNode::get(CalledFunc->getContext(), Scopes))); } } } /// If the inlined function has non-byval align arguments, then /// add @llvm.assume-based alignment assumptions to preserve this information. static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) { if (!PreserveAlignmentAssumptions) return; auto &DL = CS.getCaller()->getParent()->getDataLayout(); // To avoid inserting redundant assumptions, we should check for assumptions // already in the caller. To do this, we might need a DT of the caller. DominatorTree DT; bool DTCalculated = false; Function *CalledFunc = CS.getCalledFunction(); for (Function::arg_iterator I = CalledFunc->arg_begin(), E = CalledFunc->arg_end(); I != E; ++I) { unsigned Align = I->getType()->isPointerTy() ? I->getParamAlignment() : 0; if (Align && !I->hasByValOrInAllocaAttr() && !I->hasNUses(0)) { if (!DTCalculated) { DT.recalculate(const_cast(*CS.getInstruction()->getParent() ->getParent())); DTCalculated = true; } // If we can already prove the asserted alignment in the context of the // caller, then don't bother inserting the assumption. Value *Arg = CS.getArgument(I->getArgNo()); if (getKnownAlignment(Arg, DL, CS.getInstruction(), &IFI.ACT->getAssumptionCache(*CS.getCaller()), &DT) >= Align) continue; IRBuilder<>(CS.getInstruction()) .CreateAlignmentAssumption(DL, Arg, Align); } } } /// Once we have cloned code over from a callee into the caller, /// update the specified callgraph to reflect the changes we made. /// Note that it's possible that not all code was copied over, so only /// some edges of the callgraph may remain. static void UpdateCallGraphAfterInlining(CallSite CS, Function::iterator FirstNewBlock, ValueToValueMapTy &VMap, InlineFunctionInfo &IFI) { CallGraph &CG = *IFI.CG; const Function *Caller = CS.getInstruction()->getParent()->getParent(); const Function *Callee = CS.getCalledFunction(); CallGraphNode *CalleeNode = CG[Callee]; CallGraphNode *CallerNode = CG[Caller]; // Since we inlined some uninlined call sites in the callee into the caller, // add edges from the caller to all of the callees of the callee. CallGraphNode::iterator I = CalleeNode->begin(), E = CalleeNode->end(); // Consider the case where CalleeNode == CallerNode. CallGraphNode::CalledFunctionsVector CallCache; if (CalleeNode == CallerNode) { CallCache.assign(I, E); I = CallCache.begin(); E = CallCache.end(); } for (; I != E; ++I) { const Value *OrigCall = I->first; ValueToValueMapTy::iterator VMI = VMap.find(OrigCall); // Only copy the edge if the call was inlined! if (VMI == VMap.end() || VMI->second == nullptr) continue; // If the call was inlined, but then constant folded, there is no edge to // add. Check for this case. Instruction *NewCall = dyn_cast(VMI->second); if (!NewCall) continue; // We do not treat intrinsic calls like real function calls because we // expect them to become inline code; do not add an edge for an intrinsic. CallSite CS = CallSite(NewCall); if (CS && CS.getCalledFunction() && CS.getCalledFunction()->isIntrinsic()) continue; // Remember that this call site got inlined for the client of // InlineFunction. IFI.InlinedCalls.push_back(NewCall); // It's possible that inlining the callsite will cause it to go from an // indirect to a direct call by resolving a function pointer. If this // happens, set the callee of the new call site to a more precise // destination. This can also happen if the call graph node of the caller // was just unnecessarily imprecise. if (!I->second->getFunction()) if (Function *F = CallSite(NewCall).getCalledFunction()) { // Indirect call site resolved to direct call. CallerNode->addCalledFunction(CallSite(NewCall), CG[F]); continue; } CallerNode->addCalledFunction(CallSite(NewCall), I->second); } // Update the call graph by deleting the edge from Callee to Caller. We must // do this after the loop above in case Caller and Callee are the same. CallerNode->removeCallEdgeFor(CS); } static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M, BasicBlock *InsertBlock, InlineFunctionInfo &IFI) { Type *AggTy = cast(Src->getType())->getElementType(); IRBuilder<> Builder(InsertBlock, InsertBlock->begin()); Value *Size = Builder.getInt64(M->getDataLayout().getTypeStoreSize(AggTy)); // Always generate a memcpy of alignment 1 here because we don't know // the alignment of the src pointer. Other optimizations can infer // better alignment. Builder.CreateMemCpy(Dst, Src, Size, /*Align=*/1); } /// When inlining a call site that has a byval argument, /// we have to make the implicit memcpy explicit by adding it. static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, const Function *CalledFunc, InlineFunctionInfo &IFI, unsigned ByValAlignment) { PointerType *ArgTy = cast(Arg->getType()); Type *AggTy = ArgTy->getElementType(); Function *Caller = TheCall->getParent()->getParent(); // If the called function is readonly, then it could not mutate the caller's // copy of the byval'd memory. In this case, it is safe to elide the copy and // temporary. if (CalledFunc->onlyReadsMemory()) { // If the byval argument has a specified alignment that is greater than the // passed in pointer, then we either have to round up the input pointer or // give up on this transformation. if (ByValAlignment <= 1) // 0 = unspecified, 1 = no particular alignment. return Arg; const DataLayout &DL = Caller->getParent()->getDataLayout(); // If the pointer is already known to be sufficiently aligned, or if we can // round it up to a larger alignment, then we don't need a temporary. if (getOrEnforceKnownAlignment(Arg, ByValAlignment, DL, TheCall, &IFI.ACT->getAssumptionCache(*Caller)) >= ByValAlignment) return Arg; // Otherwise, we have to make a memcpy to get a safe alignment. This is bad // for code quality, but rarely happens and is required for correctness. } // Create the alloca. If we have DataLayout, use nice alignment. unsigned Align = Caller->getParent()->getDataLayout().getPrefTypeAlignment(AggTy); // If the byval had an alignment specified, we *must* use at least that // alignment, as it is required by the byval argument (and uses of the // pointer inside the callee). Align = std::max(Align, ByValAlignment); Value *NewAlloca = new AllocaInst(AggTy, nullptr, Align, Arg->getName(), &*Caller->begin()->begin()); IFI.StaticAllocas.push_back(cast(NewAlloca)); // Uses of the argument in the function should use our new alloca // instead. return NewAlloca; } // Check whether this Value is used by a lifetime intrinsic. static bool isUsedByLifetimeMarker(Value *V) { for (User *U : V->users()) { if (IntrinsicInst *II = dyn_cast(U)) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: return true; } } } return false; } // Check whether the given alloca already has // lifetime.start or lifetime.end intrinsics. static bool hasLifetimeMarkers(AllocaInst *AI) { Type *Ty = AI->getType(); Type *Int8PtrTy = Type::getInt8PtrTy(Ty->getContext(), Ty->getPointerAddressSpace()); if (Ty == Int8PtrTy) return isUsedByLifetimeMarker(AI); // Do a scan to find all the casts to i8*. for (User *U : AI->users()) { if (U->getType() != Int8PtrTy) continue; if (U->stripPointerCasts() != AI) continue; if (isUsedByLifetimeMarker(U)) return true; } return false; } /// Rebuild the entire inlined-at chain for this instruction so that the top of /// the chain now is inlined-at the new call site. static DebugLoc updateInlinedAtInfo(DebugLoc DL, DILocation *InlinedAtNode, LLVMContext &Ctx, DenseMap &IANodes) { SmallVector InlinedAtLocations; DILocation *Last = InlinedAtNode; DILocation *CurInlinedAt = DL; // Gather all the inlined-at nodes while (DILocation *IA = CurInlinedAt->getInlinedAt()) { // Skip any we've already built nodes for if (DILocation *Found = IANodes[IA]) { Last = Found; break; } InlinedAtLocations.push_back(IA); CurInlinedAt = IA; } // Starting from the top, rebuild the nodes to point to the new inlined-at // location (then rebuilding the rest of the chain behind it) and update the // map of already-constructed inlined-at nodes. for (const DILocation *MD : make_range(InlinedAtLocations.rbegin(), InlinedAtLocations.rend())) { Last = IANodes[MD] = DILocation::getDistinct( Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last); } // And finally create the normal location for this instruction, referring to // the new inlined-at chain. return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last); } /// Update inlined instructions' line numbers to /// to encode location where these instructions are inlined. static void fixupLineNumbers(Function *Fn, Function::iterator FI, Instruction *TheCall) { DebugLoc TheCallDL = TheCall->getDebugLoc(); if (!TheCallDL) return; auto &Ctx = Fn->getContext(); DILocation *InlinedAtNode = TheCallDL; // Create a unique call site, not to be confused with any other call from the // same location. InlinedAtNode = DILocation::getDistinct( Ctx, InlinedAtNode->getLine(), InlinedAtNode->getColumn(), InlinedAtNode->getScope(), InlinedAtNode->getInlinedAt()); // Cache the inlined-at nodes as they're built so they are reused, without // this every instruction's inlined-at chain would become distinct from each // other. DenseMap IANodes; for (; FI != Fn->end(); ++FI) { for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { DebugLoc DL = BI->getDebugLoc(); if (!DL) { // If the inlined instruction has no line number, make it look as if it // originates from the call location. This is important for // ((__always_inline__, __nodebug__)) functions which must use caller // location for all instructions in their function body. // Don't update static allocas, as they may get moved later. if (auto *AI = dyn_cast(BI)) if (isa(AI->getArraySize())) continue; BI->setDebugLoc(TheCallDL); } else { BI->setDebugLoc(updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes)); } } } } /// This function inlines the called function into the basic block of the /// caller. This returns false if it is not possible to inline this call. /// The program is still in a well defined state if this occurs though. /// /// Note that this only does one level of inlining. For example, if the /// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now /// exists in the instruction stream. Similarly this will inline a recursive /// function by one level. bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, AAResults *CalleeAAR, bool InsertLifetime) { Instruction *TheCall = CS.getInstruction(); assert(TheCall->getParent() && TheCall->getParent()->getParent() && "Instruction not in function!"); // If IFI has any state in it, zap it before we fill it in. IFI.reset(); const Function *CalledFunc = CS.getCalledFunction(); if (!CalledFunc || // Can't inline external function or indirect CalledFunc->isDeclaration() || // call, or call to a vararg function! CalledFunc->getFunctionType()->isVarArg()) return false; // The inliner does not know how to inline through calls with operand bundles // in general ... if (CS.hasOperandBundles()) { for (int i = 0, e = CS.getNumOperandBundles(); i != e; ++i) { uint32_t Tag = CS.getOperandBundleAt(i).getTagID(); // ... but it knows how to inline through "deopt" operand bundles ... if (Tag == LLVMContext::OB_deopt) continue; // ... and "funclet" operand bundles. if (Tag == LLVMContext::OB_funclet) continue; return false; } } // If the call to the callee cannot throw, set the 'nounwind' flag on any // calls that we inline. bool MarkNoUnwind = CS.doesNotThrow(); BasicBlock *OrigBB = TheCall->getParent(); Function *Caller = OrigBB->getParent(); // GC poses two hazards to inlining, which only occur when the callee has GC: // 1. If the caller has no GC, then the callee's GC must be propagated to the // caller. // 2. If the caller has a differing GC, it is invalid to inline. if (CalledFunc->hasGC()) { if (!Caller->hasGC()) Caller->setGC(CalledFunc->getGC()); else if (CalledFunc->getGC() != Caller->getGC()) return false; } // Get the personality function from the callee if it contains a landing pad. Constant *CalledPersonality = CalledFunc->hasPersonalityFn() ? CalledFunc->getPersonalityFn()->stripPointerCasts() : nullptr; // Find the personality function used by the landing pads of the caller. If it // exists, then check to see that it matches the personality function used in // the callee. Constant *CallerPersonality = Caller->hasPersonalityFn() ? Caller->getPersonalityFn()->stripPointerCasts() : nullptr; if (CalledPersonality) { if (!CallerPersonality) Caller->setPersonalityFn(CalledPersonality); // If the personality functions match, then we can perform the // inlining. Otherwise, we can't inline. // TODO: This isn't 100% true. Some personality functions are proper // supersets of others and can be used in place of the other. else if (CalledPersonality != CallerPersonality) return false; } // We need to figure out which funclet the callsite was in so that we may // properly nest the callee. Instruction *CallSiteEHPad = nullptr; if (CallerPersonality) { EHPersonality Personality = classifyEHPersonality(CallerPersonality); if (isFuncletEHPersonality(Personality)) { Optional ParentFunclet = CS.getOperandBundle(LLVMContext::OB_funclet); if (ParentFunclet) CallSiteEHPad = cast(ParentFunclet->Inputs.front()); // OK, the inlining site is legal. What about the target function? if (CallSiteEHPad) { if (Personality == EHPersonality::MSVC_CXX) { // The MSVC personality cannot tolerate catches getting inlined into // cleanup funclets. if (isa(CallSiteEHPad)) { // Ok, the call site is within a cleanuppad. Let's check the callee // for catchpads. for (const BasicBlock &CalledBB : *CalledFunc) { if (isa(CalledBB.getFirstNonPHI())) return false; } } } else if (isAsynchronousEHPersonality(Personality)) { // SEH is even less tolerant, there may not be any sort of exceptional // funclet in the callee. for (const BasicBlock &CalledBB : *CalledFunc) { if (CalledBB.isEHPad()) return false; } } } } } // Get an iterator to the last basic block in the function, which will have // the new function inlined after it. Function::iterator LastBlock = --Caller->end(); // Make sure to capture all of the return instructions from the cloned // function. SmallVector Returns; ClonedCodeInfo InlinedFunctionInfo; Function::iterator FirstNewBlock; { // Scope to destroy VMap after cloning. ValueToValueMapTy VMap; // Keep a list of pair (dst, src) to emit byval initializations. SmallVector, 4> ByValInit; auto &DL = Caller->getParent()->getDataLayout(); assert(CalledFunc->arg_size() == CS.arg_size() && "No varargs calls can be inlined!"); // Calculate the vector of arguments to pass into the function cloner, which // matches up the formal to the actual argument values. CallSite::arg_iterator AI = CS.arg_begin(); unsigned ArgNo = 0; for (Function::const_arg_iterator I = CalledFunc->arg_begin(), E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) { Value *ActualArg = *AI; // When byval arguments actually inlined, we need to make the copy implied // by them explicit. However, we don't do this if the callee is readonly // or readnone, because the copy would be unneeded: the callee doesn't // modify the struct. if (CS.isByValArgument(ArgNo)) { ActualArg = HandleByValArgument(ActualArg, TheCall, CalledFunc, IFI, CalledFunc->getParamAlignment(ArgNo+1)); if (ActualArg != *AI) ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI)); } VMap[&*I] = ActualArg; } // Add alignment assumptions if necessary. We do this before the inlined // instructions are actually cloned into the caller so that we can easily // check what will be known at the start of the inlined code. AddAlignmentAssumptions(CS, IFI); // We want the inliner to prune the code as it copies. We would LOVE to // have no dead or constant instructions leftover after inlining occurs // (which can happen, e.g., because an argument was constant), but we'll be // happy with whatever the cloner can do. CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, /*ModuleLevelChanges=*/false, Returns, ".i", &InlinedFunctionInfo, TheCall); // Remember the first block that is newly cloned over. FirstNewBlock = LastBlock; ++FirstNewBlock; // Inject byval arguments initialization. for (std::pair &Init : ByValInit) HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(), &*FirstNewBlock, IFI); Optional ParentDeopt = CS.getOperandBundle(LLVMContext::OB_deopt); if (ParentDeopt) { SmallVector OpDefs; for (auto &VH : InlinedFunctionInfo.OperandBundleCallSites) { Instruction *I = dyn_cast_or_null(VH); if (!I) continue; // instruction was DCE'd or RAUW'ed to undef OpDefs.clear(); CallSite ICS(I); OpDefs.reserve(ICS.getNumOperandBundles()); for (unsigned i = 0, e = ICS.getNumOperandBundles(); i < e; ++i) { auto ChildOB = ICS.getOperandBundleAt(i); if (ChildOB.getTagID() != LLVMContext::OB_deopt) { // If the inlined call has other operand bundles, let them be OpDefs.emplace_back(ChildOB); continue; } // It may be useful to separate this logic (of handling operand // bundles) out to a separate "policy" component if this gets crowded. // Prepend the parent's deoptimization continuation to the newly // inlined call's deoptimization continuation. std::vector MergedDeoptArgs; MergedDeoptArgs.reserve(ParentDeopt->Inputs.size() + ChildOB.Inputs.size()); MergedDeoptArgs.insert(MergedDeoptArgs.end(), ParentDeopt->Inputs.begin(), ParentDeopt->Inputs.end()); MergedDeoptArgs.insert(MergedDeoptArgs.end(), ChildOB.Inputs.begin(), ChildOB.Inputs.end()); OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs)); } Instruction *NewI = nullptr; if (isa(I)) NewI = CallInst::Create(cast(I), OpDefs, I); else NewI = InvokeInst::Create(cast(I), OpDefs, I); // Note: the RAUW does the appropriate fixup in VMap, so we need to do // this even if the call returns void. I->replaceAllUsesWith(NewI); VH = nullptr; I->eraseFromParent(); } } // Update the callgraph if requested. if (IFI.CG) UpdateCallGraphAfterInlining(CS, FirstNewBlock, VMap, IFI); // Update inlined instructions' line number information. fixupLineNumbers(Caller, FirstNewBlock, TheCall); // Clone existing noalias metadata if necessary. CloneAliasScopeMetadata(CS, VMap); // Add noalias metadata if necessary. AddAliasScopeMetadata(CS, VMap, DL, CalleeAAR); // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. if (IFI.ACT) IFI.ACT->getAssumptionCache(*Caller).clear(); } // If there are any alloca instructions in the block that used to be the entry // block for the callee, move them to the entry block of the caller. First // calculate which instruction they should be inserted before. We insert the // instructions at the end of the current alloca list. { BasicBlock::iterator InsertPoint = Caller->begin()->begin(); for (BasicBlock::iterator I = FirstNewBlock->begin(), E = FirstNewBlock->end(); I != E; ) { AllocaInst *AI = dyn_cast(I++); if (!AI) continue; // If the alloca is now dead, remove it. This often occurs due to code // specialization. if (AI->use_empty()) { AI->eraseFromParent(); continue; } if (!isa(AI->getArraySize())) continue; // Keep track of the static allocas that we inline into the caller. IFI.StaticAllocas.push_back(AI); // Scan for the block of allocas that we can move over, and move them // all at once. while (isa(I) && isa(cast(I)->getArraySize())) { IFI.StaticAllocas.push_back(cast(I)); ++I; } // Transfer all of the allocas over in a block. Using splice means // that the instructions aren't removed from the symbol table, then // reinserted. Caller->getEntryBlock().getInstList().splice( InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I); } // Move any dbg.declares describing the allocas into the entry basic block. DIBuilder DIB(*Caller->getParent()); for (auto &AI : IFI.StaticAllocas) replaceDbgDeclareForAlloca(AI, AI, DIB, /*Deref=*/false); } bool InlinedMustTailCalls = false; if (InlinedFunctionInfo.ContainsCalls) { CallInst::TailCallKind CallSiteTailKind = CallInst::TCK_None; if (CallInst *CI = dyn_cast(TheCall)) CallSiteTailKind = CI->getTailCallKind(); for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB) { for (Instruction &I : *BB) { CallInst *CI = dyn_cast(&I); if (!CI) continue; // We need to reduce the strength of any inlined tail calls. For // musttail, we have to avoid introducing potential unbounded stack // growth. For example, if functions 'f' and 'g' are mutually recursive // with musttail, we can inline 'g' into 'f' so long as we preserve // musttail on the cloned call to 'f'. If either the inlined call site // or the cloned call site is *not* musttail, the program already has // one frame of stack growth, so it's safe to remove musttail. Here is // a table of example transformations: // // f -> musttail g -> musttail f ==> f -> musttail f // f -> musttail g -> tail f ==> f -> tail f // f -> g -> musttail f ==> f -> f // f -> g -> tail f ==> f -> f CallInst::TailCallKind ChildTCK = CI->getTailCallKind(); ChildTCK = std::min(CallSiteTailKind, ChildTCK); CI->setTailCallKind(ChildTCK); InlinedMustTailCalls |= CI->isMustTailCall(); // Calls inlined through a 'nounwind' call site should be marked // 'nounwind'. if (MarkNoUnwind) CI->setDoesNotThrow(); } } } // Leave lifetime markers for the static alloca's, scoping them to the // function we just inlined. if (InsertLifetime && !IFI.StaticAllocas.empty()) { IRBuilder<> builder(&FirstNewBlock->front()); for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) { AllocaInst *AI = IFI.StaticAllocas[ai]; // If the alloca is already scoped to something smaller than the whole // function then there's no need to add redundant, less accurate markers. if (hasLifetimeMarkers(AI)) continue; // Try to determine the size of the allocation. ConstantInt *AllocaSize = nullptr; if (ConstantInt *AIArraySize = dyn_cast(AI->getArraySize())) { auto &DL = Caller->getParent()->getDataLayout(); Type *AllocaType = AI->getAllocatedType(); uint64_t AllocaTypeSize = DL.getTypeAllocSize(AllocaType); uint64_t AllocaArraySize = AIArraySize->getLimitedValue(); // Don't add markers for zero-sized allocas. if (AllocaArraySize == 0) continue; // Check that array size doesn't saturate uint64_t and doesn't // overflow when it's multiplied by type size. if (AllocaArraySize != ~0ULL && UINT64_MAX / AllocaArraySize >= AllocaTypeSize) { AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()), AllocaArraySize * AllocaTypeSize); } } builder.CreateLifetimeStart(AI, AllocaSize); for (ReturnInst *RI : Returns) { // Don't insert llvm.lifetime.end calls between a musttail call and a // return. The return kills all local allocas. if (InlinedMustTailCalls && RI->getParent()->getTerminatingMustTailCall()) continue; IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize); } } } // If the inlined code contained dynamic alloca instructions, wrap the inlined // code with llvm.stacksave/llvm.stackrestore intrinsics. if (InlinedFunctionInfo.ContainsDynamicAllocas) { Module *M = Caller->getParent(); // Get the two intrinsics we care about. Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave); Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore); // Insert the llvm.stacksave. CallInst *SavedPtr = IRBuilder<>(&*FirstNewBlock, FirstNewBlock->begin()) .CreateCall(StackSave, {}, "savedstack"); // Insert a call to llvm.stackrestore before any return instructions in the // inlined function. for (ReturnInst *RI : Returns) { // Don't insert llvm.stackrestore calls between a musttail call and a // return. The return will restore the stack pointer. if (InlinedMustTailCalls && RI->getParent()->getTerminatingMustTailCall()) continue; IRBuilder<>(RI).CreateCall(StackRestore, SavedPtr); } } + // If we are inlining for an invoke instruction, we must make sure to rewrite + // any call instructions into invoke instructions. This is sensitive to which + // funclet pads were top-level in the inlinee, so must be done before + // rewriting the "parent pad" links. + if (auto *II = dyn_cast(TheCall)) { + BasicBlock *UnwindDest = II->getUnwindDest(); + Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI(); + if (isa(FirstNonPHI)) { + HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo); + } else { + HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo); + } + } + // Update the lexical scopes of the new funclets and callsites. // Anything that had 'none' as its parent is now nested inside the callsite's // EHPad. if (CallSiteEHPad) { for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); BB != E; ++BB) { // Add bundle operands to any top-level call sites. SmallVector OpBundles; for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) { Instruction *I = &*BBI++; CallSite CS(I); if (!CS) continue; // Skip call sites which are nounwind intrinsics. auto *CalledFn = dyn_cast(CS.getCalledValue()->stripPointerCasts()); if (CalledFn && CalledFn->isIntrinsic() && CS.doesNotThrow()) continue; // Skip call sites which already have a "funclet" bundle. if (CS.getOperandBundle(LLVMContext::OB_funclet)) continue; CS.getOperandBundlesAsDefs(OpBundles); OpBundles.emplace_back("funclet", CallSiteEHPad); Instruction *NewInst; if (CS.isCall()) NewInst = CallInst::Create(cast(I), OpBundles, I); else NewInst = InvokeInst::Create(cast(I), OpBundles, I); NewInst->setDebugLoc(I->getDebugLoc()); NewInst->takeName(I); I->replaceAllUsesWith(NewInst); I->eraseFromParent(); OpBundles.clear(); } Instruction *I = BB->getFirstNonPHI(); if (!I->isEHPad()) continue; if (auto *CatchSwitch = dyn_cast(I)) { if (isa(CatchSwitch->getParentPad())) CatchSwitch->setParentPad(CallSiteEHPad); } else { auto *FPI = cast(I); if (isa(FPI->getParentPad())) FPI->setParentPad(CallSiteEHPad); } - } - } - - // If we are inlining for an invoke instruction, we must make sure to rewrite - // any call instructions into invoke instructions. - if (auto *II = dyn_cast(TheCall)) { - BasicBlock *UnwindDest = II->getUnwindDest(); - Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI(); - if (isa(FirstNonPHI)) { - HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo); - } else { - HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo); } } // Handle any inlined musttail call sites. In order for a new call site to be // musttail, the source of the clone and the inlined call site must have been // musttail. Therefore it's safe to return without merging control into the // phi below. if (InlinedMustTailCalls) { // Check if we need to bitcast the result of any musttail calls. Type *NewRetTy = Caller->getReturnType(); bool NeedBitCast = !TheCall->use_empty() && TheCall->getType() != NewRetTy; // Handle the returns preceded by musttail calls separately. SmallVector NormalReturns; for (ReturnInst *RI : Returns) { CallInst *ReturnedMustTail = RI->getParent()->getTerminatingMustTailCall(); if (!ReturnedMustTail) { NormalReturns.push_back(RI); continue; } if (!NeedBitCast) continue; // Delete the old return and any preceding bitcast. BasicBlock *CurBB = RI->getParent(); auto *OldCast = dyn_cast_or_null(RI->getReturnValue()); RI->eraseFromParent(); if (OldCast) OldCast->eraseFromParent(); // Insert a new bitcast and return with the right type. IRBuilder<> Builder(CurBB); Builder.CreateRet(Builder.CreateBitCast(ReturnedMustTail, NewRetTy)); } // Leave behind the normal returns so we can merge control flow. std::swap(Returns, NormalReturns); } // If we cloned in _exactly one_ basic block, and if that block ends in a // return instruction, we splice the body of the inlined callee directly into // the calling basic block. if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) { // Move all of the instructions right before the call. OrigBB->getInstList().splice(TheCall->getIterator(), FirstNewBlock->getInstList(), FirstNewBlock->begin(), FirstNewBlock->end()); // Remove the cloned basic block. Caller->getBasicBlockList().pop_back(); // If the call site was an invoke instruction, add a branch to the normal // destination. if (InvokeInst *II = dyn_cast(TheCall)) { BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), TheCall); NewBr->setDebugLoc(Returns[0]->getDebugLoc()); } // If the return instruction returned a value, replace uses of the call with // uses of the returned value. if (!TheCall->use_empty()) { ReturnInst *R = Returns[0]; if (TheCall == R->getReturnValue()) TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType())); else TheCall->replaceAllUsesWith(R->getReturnValue()); } // Since we are now done with the Call/Invoke, we can delete it. TheCall->eraseFromParent(); // Since we are now done with the return instruction, delete it also. Returns[0]->eraseFromParent(); // We are now done with the inlining. return true; } // Otherwise, we have the normal case, of more than one block to inline or // multiple return sites. // We want to clone the entire callee function into the hole between the // "starter" and "ender" blocks. How we accomplish this depends on whether // this is an invoke instruction or a call instruction. BasicBlock *AfterCallBB; BranchInst *CreatedBranchToNormalDest = nullptr; if (InvokeInst *II = dyn_cast(TheCall)) { // Add an unconditional branch to make this look like the CallInst case... CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), TheCall); // Split the basic block. This guarantees that no PHI nodes will have to be // updated due to new incoming edges, and make the invoke case more // symmetric to the call case. AfterCallBB = OrigBB->splitBasicBlock(CreatedBranchToNormalDest->getIterator(), CalledFunc->getName() + ".exit"); } else { // It's a call // If this is a call instruction, we need to split the basic block that // the call lives in. // AfterCallBB = OrigBB->splitBasicBlock(TheCall->getIterator(), CalledFunc->getName() + ".exit"); } // Change the branch that used to go to AfterCallBB to branch to the first // basic block of the inlined function. // TerminatorInst *Br = OrigBB->getTerminator(); assert(Br && Br->getOpcode() == Instruction::Br && "splitBasicBlock broken!"); Br->setOperand(0, &*FirstNewBlock); // Now that the function is correct, make it a little bit nicer. In // particular, move the basic blocks inserted from the end of the function // into the space made by splitting the source basic block. Caller->getBasicBlockList().splice(AfterCallBB->getIterator(), Caller->getBasicBlockList(), FirstNewBlock, Caller->end()); // Handle all of the return instructions that we just cloned in, and eliminate // any users of the original call/invoke instruction. Type *RTy = CalledFunc->getReturnType(); PHINode *PHI = nullptr; if (Returns.size() > 1) { // The PHI node should go at the front of the new basic block to merge all // possible incoming values. if (!TheCall->use_empty()) { PHI = PHINode::Create(RTy, Returns.size(), TheCall->getName(), &AfterCallBB->front()); // Anything that used the result of the function call should now use the // PHI node as their operand. TheCall->replaceAllUsesWith(PHI); } // Loop over all of the return instructions adding entries to the PHI node // as appropriate. if (PHI) { for (unsigned i = 0, e = Returns.size(); i != e; ++i) { ReturnInst *RI = Returns[i]; assert(RI->getReturnValue()->getType() == PHI->getType() && "Ret value not consistent in function!"); PHI->addIncoming(RI->getReturnValue(), RI->getParent()); } } // Add a branch to the merge points and remove return instructions. DebugLoc Loc; for (unsigned i = 0, e = Returns.size(); i != e; ++i) { ReturnInst *RI = Returns[i]; BranchInst* BI = BranchInst::Create(AfterCallBB, RI); Loc = RI->getDebugLoc(); BI->setDebugLoc(Loc); RI->eraseFromParent(); } // We need to set the debug location to *somewhere* inside the // inlined function. The line number may be nonsensical, but the // instruction will at least be associated with the right // function. if (CreatedBranchToNormalDest) CreatedBranchToNormalDest->setDebugLoc(Loc); } else if (!Returns.empty()) { // Otherwise, if there is exactly one return value, just replace anything // using the return value of the call with the computed value. if (!TheCall->use_empty()) { if (TheCall == Returns[0]->getReturnValue()) TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType())); else TheCall->replaceAllUsesWith(Returns[0]->getReturnValue()); } // Update PHI nodes that use the ReturnBB to use the AfterCallBB. BasicBlock *ReturnBB = Returns[0]->getParent(); ReturnBB->replaceAllUsesWith(AfterCallBB); // Splice the code from the return block into the block that it will return // to, which contains the code that was after the call. AfterCallBB->getInstList().splice(AfterCallBB->begin(), ReturnBB->getInstList()); if (CreatedBranchToNormalDest) CreatedBranchToNormalDest->setDebugLoc(Returns[0]->getDebugLoc()); // Delete the return instruction now and empty ReturnBB now. Returns[0]->eraseFromParent(); ReturnBB->eraseFromParent(); } else if (!TheCall->use_empty()) { // No returns, but something is using the return value of the call. Just // nuke the result. TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType())); } // Since we are now done with the Call/Invoke, we can delete it. TheCall->eraseFromParent(); // If we inlined any musttail calls and the original return is now // unreachable, delete it. It can only contain a bitcast and ret. if (InlinedMustTailCalls && pred_begin(AfterCallBB) == pred_end(AfterCallBB)) AfterCallBB->eraseFromParent(); // We should always be able to fold the entry block of the function into the // single predecessor of the block... assert(cast(Br)->isUnconditional() && "splitBasicBlock broken!"); BasicBlock *CalleeEntry = cast(Br)->getSuccessor(0); // Splice the code entry block into calling block, right before the // unconditional branch. CalleeEntry->replaceAllUsesWith(OrigBB); // Update PHI nodes OrigBB->getInstList().splice(Br->getIterator(), CalleeEntry->getInstList()); // Remove the unconditional branch. OrigBB->getInstList().erase(Br); // Now we can remove the CalleeEntry block, which is now empty. Caller->getBasicBlockList().erase(CalleeEntry); // If we inserted a phi node, check to see if it has a single value (e.g. all // the entries are the same or undef). If so, remove the PHI so it doesn't // block other optimizations. if (PHI) { auto &DL = Caller->getParent()->getDataLayout(); if (Value *V = SimplifyInstruction(PHI, DL, nullptr, nullptr, &IFI.ACT->getAssumptionCache(*Caller))) { PHI->replaceAllUsesWith(V); PHI->eraseFromParent(); } } return true; } Index: projects/clang380-import/contrib/llvm/lib/Transforms/Utils/Local.cpp =================================================================== --- projects/clang380-import/contrib/llvm/lib/Transforms/Utils/Local.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/lib/Transforms/Utils/Local.cpp (revision 294609) @@ -1,1594 +1,1796 @@ //===-- Local.cpp - Functions to perform local transformations ------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This family of functions perform various local transformations to the // program. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "local" STATISTIC(NumRemoved, "Number of unreachable basic blocks removed"); //===----------------------------------------------------------------------===// // Local constant propagation. // /// ConstantFoldTerminator - If a terminator instruction is predicated on a /// constant value, convert it into an unconditional branch to the constant /// destination. This is a nontrivial operation because the successors of this /// basic block must have their PHI nodes updated. /// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch /// conditions and indirectbr addresses this might make dead if /// DeleteDeadConditions is true. bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, const TargetLibraryInfo *TLI) { TerminatorInst *T = BB->getTerminator(); IRBuilder<> Builder(T); // Branch - See if we are conditional jumping on constant if (BranchInst *BI = dyn_cast(T)) { if (BI->isUnconditional()) return false; // Can't optimize uncond branch BasicBlock *Dest1 = BI->getSuccessor(0); BasicBlock *Dest2 = BI->getSuccessor(1); if (ConstantInt *Cond = dyn_cast(BI->getCondition())) { // Are we branching on constant? // YES. Change to unconditional branch... BasicBlock *Destination = Cond->getZExtValue() ? Dest1 : Dest2; BasicBlock *OldDest = Cond->getZExtValue() ? Dest2 : Dest1; //cerr << "Function: " << T->getParent()->getParent() // << "\nRemoving branch from " << T->getParent() // << "\n\nTo: " << OldDest << endl; // Let the basic block know that we are letting go of it. Based on this, // it will adjust it's PHI nodes. OldDest->removePredecessor(BB); // Replace the conditional branch with an unconditional one. Builder.CreateBr(Destination); BI->eraseFromParent(); return true; } if (Dest2 == Dest1) { // Conditional branch to same location? // This branch matches something like this: // br bool %cond, label %Dest, label %Dest // and changes it into: br label %Dest // Let the basic block know that we are letting go of one copy of it. assert(BI->getParent() && "Terminator not inserted in block!"); Dest1->removePredecessor(BI->getParent()); // Replace the conditional branch with an unconditional one. Builder.CreateBr(Dest1); Value *Cond = BI->getCondition(); BI->eraseFromParent(); if (DeleteDeadConditions) RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI); return true; } return false; } if (SwitchInst *SI = dyn_cast(T)) { // If we are switching on a constant, we can convert the switch to an // unconditional branch. ConstantInt *CI = dyn_cast(SI->getCondition()); BasicBlock *DefaultDest = SI->getDefaultDest(); BasicBlock *TheOnlyDest = DefaultDest; // If the default is unreachable, ignore it when searching for TheOnlyDest. if (isa(DefaultDest->getFirstNonPHIOrDbg()) && SI->getNumCases() > 0) { TheOnlyDest = SI->case_begin().getCaseSuccessor(); } // Figure out which case it goes to. for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { // Found case matching a constant operand? if (i.getCaseValue() == CI) { TheOnlyDest = i.getCaseSuccessor(); break; } // Check to see if this branch is going to the same place as the default // dest. If so, eliminate it as an explicit compare. if (i.getCaseSuccessor() == DefaultDest) { MDNode *MD = SI->getMetadata(LLVMContext::MD_prof); unsigned NCases = SI->getNumCases(); // Fold the case metadata into the default if there will be any branches // left, unless the metadata doesn't match the switch. if (NCases > 1 && MD && MD->getNumOperands() == 2 + NCases) { // Collect branch weights into a vector. SmallVector Weights; for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e; ++MD_i) { ConstantInt *CI = mdconst::dyn_extract(MD->getOperand(MD_i)); assert(CI); Weights.push_back(CI->getValue().getZExtValue()); } // Merge weight of this case to the default weight. unsigned idx = i.getCaseIndex(); Weights[0] += Weights[idx+1]; // Remove weight for this case. std::swap(Weights[idx+1], Weights.back()); Weights.pop_back(); SI->setMetadata(LLVMContext::MD_prof, MDBuilder(BB->getContext()). createBranchWeights(Weights)); } // Remove this entry. DefaultDest->removePredecessor(SI->getParent()); SI->removeCase(i); --i; --e; continue; } // Otherwise, check to see if the switch only branches to one destination. // We do this by reseting "TheOnlyDest" to null when we find two non-equal // destinations. if (i.getCaseSuccessor() != TheOnlyDest) TheOnlyDest = nullptr; } if (CI && !TheOnlyDest) { // Branching on a constant, but not any of the cases, go to the default // successor. TheOnlyDest = SI->getDefaultDest(); } // If we found a single destination that we can fold the switch into, do so // now. if (TheOnlyDest) { // Insert the new branch. Builder.CreateBr(TheOnlyDest); BasicBlock *BB = SI->getParent(); // Remove entries from PHI nodes which we no longer branch to... for (BasicBlock *Succ : SI->successors()) { // Found case matching a constant operand? if (Succ == TheOnlyDest) TheOnlyDest = nullptr; // Don't modify the first branch to TheOnlyDest else Succ->removePredecessor(BB); } // Delete the old switch. Value *Cond = SI->getCondition(); SI->eraseFromParent(); if (DeleteDeadConditions) RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI); return true; } if (SI->getNumCases() == 1) { // Otherwise, we can fold this switch into a conditional branch // instruction if it has only one non-default destination. SwitchInst::CaseIt FirstCase = SI->case_begin(); Value *Cond = Builder.CreateICmpEQ(SI->getCondition(), FirstCase.getCaseValue(), "cond"); // Insert the new branch. BranchInst *NewBr = Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(), SI->getDefaultDest()); MDNode *MD = SI->getMetadata(LLVMContext::MD_prof); if (MD && MD->getNumOperands() == 3) { ConstantInt *SICase = mdconst::dyn_extract(MD->getOperand(2)); ConstantInt *SIDef = mdconst::dyn_extract(MD->getOperand(1)); assert(SICase && SIDef); // The TrueWeight should be the weight for the single case of SI. NewBr->setMetadata(LLVMContext::MD_prof, MDBuilder(BB->getContext()). createBranchWeights(SICase->getValue().getZExtValue(), SIDef->getValue().getZExtValue())); } // Update make.implicit metadata to the newly-created conditional branch. MDNode *MakeImplicitMD = SI->getMetadata(LLVMContext::MD_make_implicit); if (MakeImplicitMD) NewBr->setMetadata(LLVMContext::MD_make_implicit, MakeImplicitMD); // Delete the old switch. SI->eraseFromParent(); return true; } return false; } if (IndirectBrInst *IBI = dyn_cast(T)) { // indirectbr blockaddress(@F, @BB) -> br label @BB if (BlockAddress *BA = dyn_cast(IBI->getAddress()->stripPointerCasts())) { BasicBlock *TheOnlyDest = BA->getBasicBlock(); // Insert the new branch. Builder.CreateBr(TheOnlyDest); for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) { if (IBI->getDestination(i) == TheOnlyDest) TheOnlyDest = nullptr; else IBI->getDestination(i)->removePredecessor(IBI->getParent()); } Value *Address = IBI->getAddress(); IBI->eraseFromParent(); if (DeleteDeadConditions) RecursivelyDeleteTriviallyDeadInstructions(Address, TLI); // If we didn't find our destination in the IBI successor list, then we // have undefined behavior. Replace the unconditional branch with an // 'unreachable' instruction. if (TheOnlyDest) { BB->getTerminator()->eraseFromParent(); new UnreachableInst(BB->getContext(), BB); } return true; } } return false; } //===----------------------------------------------------------------------===// // Local dead code elimination. // /// isInstructionTriviallyDead - Return true if the result produced by the /// instruction is not used, and the instruction has no side effects. /// bool llvm::isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI) { if (!I->use_empty() || isa(I)) return false; // We don't want the landingpad-like instructions removed by anything this // general. if (I->isEHPad()) return false; // We don't want debug info removed by anything this general, unless // debug info is empty. if (DbgDeclareInst *DDI = dyn_cast(I)) { if (DDI->getAddress()) return false; return true; } if (DbgValueInst *DVI = dyn_cast(I)) { if (DVI->getValue()) return false; return true; } if (!I->mayHaveSideEffects()) return true; // Special case intrinsics that "may have side effects" but can be deleted // when dead. if (IntrinsicInst *II = dyn_cast(I)) { // Safe to delete llvm.stacksave if dead. if (II->getIntrinsicID() == Intrinsic::stacksave) return true; // Lifetime intrinsics are dead when their right-hand is undef. if (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end) return isa(II->getArgOperand(1)); // Assumptions are dead if their condition is trivially true. if (II->getIntrinsicID() == Intrinsic::assume) { if (ConstantInt *Cond = dyn_cast(II->getArgOperand(0))) return !Cond->isZero(); return false; } } if (isAllocLikeFn(I, TLI)) return true; if (CallInst *CI = isFreeCall(I, TLI)) if (Constant *C = dyn_cast(CI->getArgOperand(0))) return C->isNullValue() || isa(C); return false; } /// RecursivelyDeleteTriviallyDeadInstructions - If the specified value is a /// trivially dead instruction, delete it. If that makes any of its operands /// trivially dead, delete them too, recursively. Return true if any /// instructions were deleted. bool llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI) { Instruction *I = dyn_cast(V); if (!I || !I->use_empty() || !isInstructionTriviallyDead(I, TLI)) return false; SmallVector DeadInsts; DeadInsts.push_back(I); do { I = DeadInsts.pop_back_val(); // Null out all of the instruction's operands to see if any operand becomes // dead as we go. for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { Value *OpV = I->getOperand(i); I->setOperand(i, nullptr); if (!OpV->use_empty()) continue; // If the operand is an instruction that became dead as we nulled out the // operand, and if it is 'trivially' dead, delete it in a future loop // iteration. if (Instruction *OpI = dyn_cast(OpV)) if (isInstructionTriviallyDead(OpI, TLI)) DeadInsts.push_back(OpI); } I->eraseFromParent(); } while (!DeadInsts.empty()); return true; } /// areAllUsesEqual - Check whether the uses of a value are all the same. /// This is similar to Instruction::hasOneUse() except this will also return /// true when there are no uses or multiple uses that all refer to the same /// value. static bool areAllUsesEqual(Instruction *I) { Value::user_iterator UI = I->user_begin(); Value::user_iterator UE = I->user_end(); if (UI == UE) return true; User *TheUse = *UI; for (++UI; UI != UE; ++UI) { if (*UI != TheUse) return false; } return true; } /// RecursivelyDeleteDeadPHINode - If the specified value is an effectively /// dead PHI node, due to being a def-use chain of single-use nodes that /// either forms a cycle or is terminated by a trivially dead instruction, /// delete it. If that makes any of its operands trivially dead, delete them /// too, recursively. Return true if a change was made. bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN, const TargetLibraryInfo *TLI) { SmallPtrSet Visited; for (Instruction *I = PN; areAllUsesEqual(I) && !I->mayHaveSideEffects(); I = cast(*I->user_begin())) { if (I->use_empty()) return RecursivelyDeleteTriviallyDeadInstructions(I, TLI); // If we find an instruction more than once, we're on a cycle that // won't prove fruitful. if (!Visited.insert(I).second) { // Break the cycle and delete the instruction and its operands. I->replaceAllUsesWith(UndefValue::get(I->getType())); (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI); return true; } } return false; } static bool simplifyAndDCEInstruction(Instruction *I, SmallSetVector &WorkList, const DataLayout &DL, const TargetLibraryInfo *TLI) { if (isInstructionTriviallyDead(I, TLI)) { // Null out all of the instruction's operands to see if any operand becomes // dead as we go. for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { Value *OpV = I->getOperand(i); I->setOperand(i, nullptr); if (!OpV->use_empty() || I == OpV) continue; // If the operand is an instruction that became dead as we nulled out the // operand, and if it is 'trivially' dead, delete it in a future loop // iteration. if (Instruction *OpI = dyn_cast(OpV)) if (isInstructionTriviallyDead(OpI, TLI)) WorkList.insert(OpI); } I->eraseFromParent(); return true; } if (Value *SimpleV = SimplifyInstruction(I, DL)) { // Add the users to the worklist. CAREFUL: an instruction can use itself, // in the case of a phi node. for (User *U : I->users()) if (U != I) WorkList.insert(cast(U)); // Replace the instruction with its simplified value. I->replaceAllUsesWith(SimpleV); I->eraseFromParent(); return true; } return false; } /// SimplifyInstructionsInBlock - Scan the specified basic block and try to /// simplify any instructions in it and recursively delete dead instructions. /// /// This returns true if it changed the code, note that it can delete /// instructions in other blocks as well in this block. bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetLibraryInfo *TLI) { bool MadeChange = false; const DataLayout &DL = BB->getModule()->getDataLayout(); #ifndef NDEBUG // In debug builds, ensure that the terminator of the block is never replaced // or deleted by these simplifications. The idea of simplification is that it // cannot introduce new instructions, and there is no way to replace the // terminator of a block without introducing a new instruction. AssertingVH TerminatorVH(&BB->back()); #endif SmallSetVector WorkList; // Iterate over the original function, only adding insts to the worklist // if they actually need to be revisited. This avoids having to pre-init // the worklist with the entire function's worth of instructions. for (BasicBlock::iterator BI = BB->begin(), E = std::prev(BB->end()); BI != E;) { assert(!BI->isTerminator()); Instruction *I = &*BI; ++BI; // We're visiting this instruction now, so make sure it's not in the // worklist from an earlier visit. if (!WorkList.count(I)) MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI); } while (!WorkList.empty()) { Instruction *I = WorkList.pop_back_val(); MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI); } return MadeChange; } //===----------------------------------------------------------------------===// // Control Flow Graph Restructuring. // /// RemovePredecessorAndSimplify - Like BasicBlock::removePredecessor, this /// method is called when we're about to delete Pred as a predecessor of BB. If /// BB contains any PHI nodes, this drops the entries in the PHI nodes for Pred. /// /// Unlike the removePredecessor method, this attempts to simplify uses of PHI /// nodes that collapse into identity values. For example, if we have: /// x = phi(1, 0, 0, 0) /// y = and x, z /// /// .. and delete the predecessor corresponding to the '1', this will attempt to /// recursively fold the and to 0. void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred) { // This only adjusts blocks with PHI nodes. if (!isa(BB->begin())) return; // Remove the entries for Pred from the PHI nodes in BB, but do not simplify // them down. This will leave us with single entry phi nodes and other phis // that can be removed. BB->removePredecessor(Pred, true); WeakVH PhiIt = &BB->front(); while (PHINode *PN = dyn_cast(PhiIt)) { PhiIt = &*++BasicBlock::iterator(cast(PhiIt)); Value *OldPhiIt = PhiIt; if (!recursivelySimplifyInstruction(PN)) continue; // If recursive simplification ended up deleting the next PHI node we would // iterate to, then our iterator is invalid, restart scanning from the top // of the block. if (PhiIt != OldPhiIt) PhiIt = &BB->front(); } } /// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and its /// predecessor is known to have one successor (DestBB!). Eliminate the edge /// between them, moving the instructions in the predecessor into DestBB and /// deleting the predecessor block. /// void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) { // If BB has single-entry PHI nodes, fold them. while (PHINode *PN = dyn_cast(DestBB->begin())) { Value *NewVal = PN->getIncomingValue(0); // Replace self referencing PHI with undef, it must be dead. if (NewVal == PN) NewVal = UndefValue::get(PN->getType()); PN->replaceAllUsesWith(NewVal); PN->eraseFromParent(); } BasicBlock *PredBB = DestBB->getSinglePredecessor(); assert(PredBB && "Block doesn't have a single predecessor!"); // Zap anything that took the address of DestBB. Not doing this will give the // address an invalid value. if (DestBB->hasAddressTaken()) { BlockAddress *BA = BlockAddress::get(DestBB); Constant *Replacement = ConstantInt::get(llvm::Type::getInt32Ty(BA->getContext()), 1); BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(Replacement, BA->getType())); BA->destroyConstant(); } // Anything that branched to PredBB now branches to DestBB. PredBB->replaceAllUsesWith(DestBB); // Splice all the instructions from PredBB to DestBB. PredBB->getTerminator()->eraseFromParent(); DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList()); // If the PredBB is the entry block of the function, move DestBB up to // become the entry block after we erase PredBB. if (PredBB == &DestBB->getParent()->getEntryBlock()) DestBB->moveAfter(PredBB); if (DT) { BasicBlock *PredBBIDom = DT->getNode(PredBB)->getIDom()->getBlock(); DT->changeImmediateDominator(DestBB, PredBBIDom); DT->eraseNode(PredBB); } // Nuke BB. PredBB->eraseFromParent(); } /// CanMergeValues - Return true if we can choose one of these values to use /// in place of the other. Note that we will always choose the non-undef /// value to keep. static bool CanMergeValues(Value *First, Value *Second) { return First == Second || isa(First) || isa(Second); } /// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an /// almost-empty BB ending in an unconditional branch to Succ, into Succ. /// /// Assumption: Succ is the single successor for BB. /// static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) { assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!"); DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into " << Succ->getName() << "\n"); // Shortcut, if there is only a single predecessor it must be BB and merging // is always safe if (Succ->getSinglePredecessor()) return true; // Make a list of the predecessors of BB SmallPtrSet BBPreds(pred_begin(BB), pred_end(BB)); // Look at all the phi nodes in Succ, to see if they present a conflict when // merging these blocks for (BasicBlock::iterator I = Succ->begin(); isa(I); ++I) { PHINode *PN = cast(I); // If the incoming value from BB is again a PHINode in // BB which has the same incoming value for *PI as PN does, we can // merge the phi nodes and then the blocks can still be merged PHINode *BBPN = dyn_cast(PN->getIncomingValueForBlock(BB)); if (BBPN && BBPN->getParent() == BB) { for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) { BasicBlock *IBB = PN->getIncomingBlock(PI); if (BBPreds.count(IBB) && !CanMergeValues(BBPN->getIncomingValueForBlock(IBB), PN->getIncomingValue(PI))) { DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in " << Succ->getName() << " is conflicting with " << BBPN->getName() << " with regard to common predecessor " << IBB->getName() << "\n"); return false; } } } else { Value* Val = PN->getIncomingValueForBlock(BB); for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) { // See if the incoming value for the common predecessor is equal to the // one for BB, in which case this phi node will not prevent the merging // of the block. BasicBlock *IBB = PN->getIncomingBlock(PI); if (BBPreds.count(IBB) && !CanMergeValues(Val, PN->getIncomingValue(PI))) { DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in " << Succ->getName() << " is conflicting with regard to common " << "predecessor " << IBB->getName() << "\n"); return false; } } } } return true; } typedef SmallVector PredBlockVector; typedef DenseMap IncomingValueMap; /// \brief Determines the value to use as the phi node input for a block. /// /// Select between \p OldVal any value that we know flows from \p BB /// to a particular phi on the basis of which one (if either) is not /// undef. Update IncomingValues based on the selected value. /// /// \param OldVal The value we are considering selecting. /// \param BB The block that the value flows in from. /// \param IncomingValues A map from block-to-value for other phi inputs /// that we have examined. /// /// \returns the selected value. static Value *selectIncomingValueForBlock(Value *OldVal, BasicBlock *BB, IncomingValueMap &IncomingValues) { if (!isa(OldVal)) { assert((!IncomingValues.count(BB) || IncomingValues.find(BB)->second == OldVal) && "Expected OldVal to match incoming value from BB!"); IncomingValues.insert(std::make_pair(BB, OldVal)); return OldVal; } IncomingValueMap::const_iterator It = IncomingValues.find(BB); if (It != IncomingValues.end()) return It->second; return OldVal; } /// \brief Create a map from block to value for the operands of a /// given phi. /// /// Create a map from block to value for each non-undef value flowing /// into \p PN. /// /// \param PN The phi we are collecting the map for. /// \param IncomingValues [out] The map from block to value for this phi. static void gatherIncomingValuesToPhi(PHINode *PN, IncomingValueMap &IncomingValues) { for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *BB = PN->getIncomingBlock(i); Value *V = PN->getIncomingValue(i); if (!isa(V)) IncomingValues.insert(std::make_pair(BB, V)); } } /// \brief Replace the incoming undef values to a phi with the values /// from a block-to-value map. /// /// \param PN The phi we are replacing the undefs in. /// \param IncomingValues A map from block to value. static void replaceUndefValuesInPhi(PHINode *PN, const IncomingValueMap &IncomingValues) { for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *V = PN->getIncomingValue(i); if (!isa(V)) continue; BasicBlock *BB = PN->getIncomingBlock(i); IncomingValueMap::const_iterator It = IncomingValues.find(BB); if (It == IncomingValues.end()) continue; PN->setIncomingValue(i, It->second); } } /// \brief Replace a value flowing from a block to a phi with /// potentially multiple instances of that value flowing from the /// block's predecessors to the phi. /// /// \param BB The block with the value flowing into the phi. /// \param BBPreds The predecessors of BB. /// \param PN The phi that we are updating. static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB, const PredBlockVector &BBPreds, PHINode *PN) { Value *OldVal = PN->removeIncomingValue(BB, false); assert(OldVal && "No entry in PHI for Pred BB!"); IncomingValueMap IncomingValues; // We are merging two blocks - BB, and the block containing PN - and // as a result we need to redirect edges from the predecessors of BB // to go to the block containing PN, and update PN // accordingly. Since we allow merging blocks in the case where the // predecessor and successor blocks both share some predecessors, // and where some of those common predecessors might have undef // values flowing into PN, we want to rewrite those values to be // consistent with the non-undef values. gatherIncomingValuesToPhi(PN, IncomingValues); // If this incoming value is one of the PHI nodes in BB, the new entries // in the PHI node are the entries from the old PHI. if (isa(OldVal) && cast(OldVal)->getParent() == BB) { PHINode *OldValPN = cast(OldVal); for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i) { // Note that, since we are merging phi nodes and BB and Succ might // have common predecessors, we could end up with a phi node with // identical incoming branches. This will be cleaned up later (and // will trigger asserts if we try to clean it up now, without also // simplifying the corresponding conditional branch). BasicBlock *PredBB = OldValPN->getIncomingBlock(i); Value *PredVal = OldValPN->getIncomingValue(i); Value *Selected = selectIncomingValueForBlock(PredVal, PredBB, IncomingValues); // And add a new incoming value for this predecessor for the // newly retargeted branch. PN->addIncoming(Selected, PredBB); } } else { for (unsigned i = 0, e = BBPreds.size(); i != e; ++i) { // Update existing incoming values in PN for this // predecessor of BB. BasicBlock *PredBB = BBPreds[i]; Value *Selected = selectIncomingValueForBlock(OldVal, PredBB, IncomingValues); // And add a new incoming value for this predecessor for the // newly retargeted branch. PN->addIncoming(Selected, PredBB); } } replaceUndefValuesInPhi(PN, IncomingValues); } /// TryToSimplifyUncondBranchFromEmptyBlock - BB is known to contain an /// unconditional branch, and contains no instructions other than PHI nodes, /// potential side-effect free intrinsics and the branch. If possible, /// eliminate BB by rewriting all the predecessors to branch to the successor /// block and return true. If we can't transform, return false. bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) { assert(BB != &BB->getParent()->getEntryBlock() && "TryToSimplifyUncondBranchFromEmptyBlock called on entry block!"); // We can't eliminate infinite loops. BasicBlock *Succ = cast(BB->getTerminator())->getSuccessor(0); if (BB == Succ) return false; // Check to see if merging these blocks would cause conflicts for any of the // phi nodes in BB or Succ. If not, we can safely merge. if (!CanPropagatePredecessorsForPHIs(BB, Succ)) return false; // Check for cases where Succ has multiple predecessors and a PHI node in BB // has uses which will not disappear when the PHI nodes are merged. It is // possible to handle such cases, but difficult: it requires checking whether // BB dominates Succ, which is non-trivial to calculate in the case where // Succ has multiple predecessors. Also, it requires checking whether // constructing the necessary self-referential PHI node doesn't introduce any // conflicts; this isn't too difficult, but the previous code for doing this // was incorrect. // // Note that if this check finds a live use, BB dominates Succ, so BB is // something like a loop pre-header (or rarely, a part of an irreducible CFG); // folding the branch isn't profitable in that case anyway. if (!Succ->getSinglePredecessor()) { BasicBlock::iterator BBI = BB->begin(); while (isa(*BBI)) { for (Use &U : BBI->uses()) { if (PHINode* PN = dyn_cast(U.getUser())) { if (PN->getIncomingBlock(U) != BB) return false; } else { return false; } } ++BBI; } } DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB); if (isa(Succ->begin())) { // If there is more than one pred of succ, and there are PHI nodes in // the successor, then we need to add incoming edges for the PHI nodes // const PredBlockVector BBPreds(pred_begin(BB), pred_end(BB)); // Loop over all of the PHI nodes in the successor of BB. for (BasicBlock::iterator I = Succ->begin(); isa(I); ++I) { PHINode *PN = cast(I); redirectValuesFromPredecessorsToPhi(BB, BBPreds, PN); } } if (Succ->getSinglePredecessor()) { // BB is the only predecessor of Succ, so Succ will end up with exactly // the same predecessors BB had. // Copy over any phi, debug or lifetime instruction. BB->getTerminator()->eraseFromParent(); Succ->getInstList().splice(Succ->getFirstNonPHI()->getIterator(), BB->getInstList()); } else { while (PHINode *PN = dyn_cast(&BB->front())) { // We explicitly check for such uses in CanPropagatePredecessorsForPHIs. assert(PN->use_empty() && "There shouldn't be any uses here!"); PN->eraseFromParent(); } } // Everything that jumped to BB now goes to Succ. BB->replaceAllUsesWith(Succ); if (!Succ->hasName()) Succ->takeName(BB); BB->eraseFromParent(); // Delete the old basic block. return true; } /// EliminateDuplicatePHINodes - Check for and eliminate duplicate PHI /// nodes in this block. This doesn't try to be clever about PHI nodes /// which differ only in the order of the incoming values, but instcombine /// orders them so it usually won't matter. /// bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { // This implementation doesn't currently consider undef operands // specially. Theoretically, two phis which are identical except for // one having an undef where the other doesn't could be collapsed. struct PHIDenseMapInfo { static PHINode *getEmptyKey() { return DenseMapInfo::getEmptyKey(); } static PHINode *getTombstoneKey() { return DenseMapInfo::getTombstoneKey(); } static unsigned getHashValue(PHINode *PN) { // Compute a hash value on the operands. Instcombine will likely have // sorted them, which helps expose duplicates, but we have to check all // the operands to be safe in case instcombine hasn't run. return static_cast(hash_combine( hash_combine_range(PN->value_op_begin(), PN->value_op_end()), hash_combine_range(PN->block_begin(), PN->block_end()))); } static bool isEqual(PHINode *LHS, PHINode *RHS) { if (LHS == getEmptyKey() || LHS == getTombstoneKey() || RHS == getEmptyKey() || RHS == getTombstoneKey()) return LHS == RHS; return LHS->isIdenticalTo(RHS); } }; // Set of unique PHINodes. DenseSet PHISet; // Examine each PHI. bool Changed = false; for (auto I = BB->begin(); PHINode *PN = dyn_cast(I++);) { auto Inserted = PHISet.insert(PN); if (!Inserted.second) { // A duplicate. Replace this PHI with its duplicate. PN->replaceAllUsesWith(*Inserted.first); PN->eraseFromParent(); Changed = true; // The RAUW can change PHIs that we already visited. Start over from the // beginning. PHISet.clear(); I = BB->begin(); } } return Changed; } /// enforceKnownAlignment - If the specified pointer points to an object that /// we control, modify the object's alignment to PrefAlign. This isn't /// often possible though. If alignment is important, a more reliable approach /// is to simply align all global variables and allocation instructions to /// their preferred alignment from the beginning. /// static unsigned enforceKnownAlignment(Value *V, unsigned Align, unsigned PrefAlign, const DataLayout &DL) { assert(PrefAlign > Align); V = V->stripPointerCasts(); if (AllocaInst *AI = dyn_cast(V)) { // TODO: ideally, computeKnownBits ought to have used // AllocaInst::getAlignment() in its computation already, making // the below max redundant. But, as it turns out, // stripPointerCasts recurses through infinite layers of bitcasts, // while computeKnownBits is not allowed to traverse more than 6 // levels. Align = std::max(AI->getAlignment(), Align); if (PrefAlign <= Align) return Align; // If the preferred alignment is greater than the natural stack alignment // then don't round up. This avoids dynamic stack realignment. if (DL.exceedsNaturalStackAlignment(PrefAlign)) return Align; AI->setAlignment(PrefAlign); return PrefAlign; } if (auto *GO = dyn_cast(V)) { // TODO: as above, this shouldn't be necessary. Align = std::max(GO->getAlignment(), Align); if (PrefAlign <= Align) return Align; // If there is a large requested alignment and we can, bump up the alignment // of the global. If the memory we set aside for the global may not be the // memory used by the final program then it is impossible for us to reliably // enforce the preferred alignment. if (!GO->canIncreaseAlignment()) return Align; GO->setAlignment(PrefAlign); return PrefAlign; } return Align; } /// getOrEnforceKnownAlignment - If the specified pointer has an alignment that /// we can determine, return it, otherwise return 0. If PrefAlign is specified, /// and it is more than the alignment of the ultimate object, see if we can /// increase the alignment of the ultimate object, making this check succeed. unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, const DataLayout &DL, const Instruction *CxtI, AssumptionCache *AC, const DominatorTree *DT) { assert(V->getType()->isPointerTy() && "getOrEnforceKnownAlignment expects a pointer!"); unsigned BitWidth = DL.getPointerTypeSizeInBits(V->getType()); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); computeKnownBits(V, KnownZero, KnownOne, DL, 0, AC, CxtI, DT); unsigned TrailZ = KnownZero.countTrailingOnes(); // Avoid trouble with ridiculously large TrailZ values, such as // those computed from a null pointer. TrailZ = std::min(TrailZ, unsigned(sizeof(unsigned) * CHAR_BIT - 1)); unsigned Align = 1u << std::min(BitWidth - 1, TrailZ); // LLVM doesn't support alignments larger than this currently. Align = std::min(Align, +Value::MaximumAlignment); if (PrefAlign > Align) Align = enforceKnownAlignment(V, Align, PrefAlign, DL); // We don't need to make any adjustment. return Align; } ///===---------------------------------------------------------------------===// /// Dbg Intrinsic utilities /// /// See if there is a dbg.value intrinsic for DIVar before I. static bool LdStHasDebugValue(const DILocalVariable *DIVar, Instruction *I) { // Since we can't guarantee that the original dbg.declare instrinsic // is removed by LowerDbgDeclare(), we need to make sure that we are // not inserting the same dbg.value intrinsic over and over. llvm::BasicBlock::InstListType::iterator PrevI(I); if (PrevI != I->getParent()->getInstList().begin()) { --PrevI; if (DbgValueInst *DVI = dyn_cast(PrevI)) if (DVI->getValue() == I->getOperand(0) && DVI->getOffset() == 0 && DVI->getVariable() == DIVar) return true; } return false; } /// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value /// that has an associated llvm.dbg.decl intrinsic. bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, StoreInst *SI, DIBuilder &Builder) { auto *DIVar = DDI->getVariable(); auto *DIExpr = DDI->getExpression(); assert(DIVar && "Missing variable"); if (LdStHasDebugValue(DIVar, SI)) return true; // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. Argument *ExtendedArg = nullptr; if (ZExtInst *ZExt = dyn_cast(SI->getOperand(0))) ExtendedArg = dyn_cast(ZExt->getOperand(0)); if (SExtInst *SExt = dyn_cast(SI->getOperand(0))) ExtendedArg = dyn_cast(SExt->getOperand(0)); if (ExtendedArg) { // We're now only describing a subset of the variable. The piece we're // describing will always be smaller than the variable size, because // VariableSize == Size of Alloca described by DDI. Since SI stores // to the alloca described by DDI, if it's first operand is an extend, // we're guaranteed that before extension, the value was narrower than // the size of the alloca, hence the size of the described variable. SmallVector NewDIExpr; unsigned PieceOffset = 0; // If this already is a bit piece, we drop the bit piece from the expression // and record the offset. if (DIExpr->isBitPiece()) { NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()-3); PieceOffset = DIExpr->getBitPieceOffset(); } else { NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); } NewDIExpr.push_back(dwarf::DW_OP_bit_piece); NewDIExpr.push_back(PieceOffset); //Offset const DataLayout &DL = DDI->getModule()->getDataLayout(); NewDIExpr.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); // Size Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, Builder.createExpression(NewDIExpr), DDI->getDebugLoc(), SI); } else Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr, DDI->getDebugLoc(), SI); return true; } /// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value /// that has an associated llvm.dbg.decl intrinsic. bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, LoadInst *LI, DIBuilder &Builder) { auto *DIVar = DDI->getVariable(); auto *DIExpr = DDI->getExpression(); assert(DIVar && "Missing variable"); if (LdStHasDebugValue(DIVar, LI)) return true; // We are now tracking the loaded value instead of the address. In the // future if multi-location support is added to the IR, it might be // preferable to keep tracking both the loaded value and the original // address in case the alloca can not be elided. Instruction *DbgValue = Builder.insertDbgValueIntrinsic( LI, 0, DIVar, DIExpr, DDI->getDebugLoc(), (Instruction *)nullptr); DbgValue->insertAfter(LI); return true; } /// Determine whether this alloca is either a VLA or an array. static bool isArray(AllocaInst *AI) { return AI->isArrayAllocation() || AI->getType()->getElementType()->isArrayTy(); } /// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into appropriate set /// of llvm.dbg.value intrinsics. bool llvm::LowerDbgDeclare(Function &F) { DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); SmallVector Dbgs; for (auto &FI : F) for (Instruction &BI : FI) if (auto DDI = dyn_cast(&BI)) Dbgs.push_back(DDI); if (Dbgs.empty()) return false; for (auto &I : Dbgs) { DbgDeclareInst *DDI = I; AllocaInst *AI = dyn_cast_or_null(DDI->getAddress()); // If this is an alloca for a scalar variable, insert a dbg.value // at each load and store to the alloca and erase the dbg.declare. // The dbg.values allow tracking a variable even if it is not // stored on the stack, while the dbg.declare can only describe // the stack slot (and at a lexical-scope granularity). Later // passes will attempt to elide the stack slot. if (AI && !isArray(AI)) { for (User *U : AI->users()) if (StoreInst *SI = dyn_cast(U)) ConvertDebugDeclareToDebugValue(DDI, SI, DIB); else if (LoadInst *LI = dyn_cast(U)) ConvertDebugDeclareToDebugValue(DDI, LI, DIB); else if (CallInst *CI = dyn_cast(U)) { // This is a call by-value or some other instruction that // takes a pointer to the variable. Insert a *value* // intrinsic that describes the alloca. SmallVector NewDIExpr; auto *DIExpr = DDI->getExpression(); NewDIExpr.push_back(dwarf::DW_OP_deref); NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); DIB.insertDbgValueIntrinsic(AI, 0, DDI->getVariable(), DIB.createExpression(NewDIExpr), DDI->getDebugLoc(), CI); } DDI->eraseFromParent(); } } return true; } /// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic describing the /// alloca 'V', if any. DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { if (auto *L = LocalAsMetadata::getIfExists(V)) if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L)) for (User *U : MDV->users()) if (DbgDeclareInst *DDI = dyn_cast(U)) return DDI; return nullptr; } bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, Instruction *InsertBefore, DIBuilder &Builder, bool Deref, int Offset) { DbgDeclareInst *DDI = FindAllocaDbgDeclare(Address); if (!DDI) return false; DebugLoc Loc = DDI->getDebugLoc(); auto *DIVar = DDI->getVariable(); auto *DIExpr = DDI->getExpression(); assert(DIVar && "Missing variable"); if (Deref || Offset) { // Create a copy of the original DIDescriptor for user variable, prepending // "deref" operation to a list of address elements, as new llvm.dbg.declare // will take a value storing address of the memory for variable, not // alloca itself. SmallVector NewDIExpr; if (Deref) NewDIExpr.push_back(dwarf::DW_OP_deref); if (Offset > 0) { NewDIExpr.push_back(dwarf::DW_OP_plus); NewDIExpr.push_back(Offset); } else if (Offset < 0) { NewDIExpr.push_back(dwarf::DW_OP_minus); NewDIExpr.push_back(-Offset); } if (DIExpr) NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); DIExpr = Builder.createExpression(NewDIExpr); } // Insert llvm.dbg.declare immediately after the original alloca, and remove // old llvm.dbg.declare. Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore); DDI->eraseFromParent(); return true; } bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, DIBuilder &Builder, bool Deref, int Offset) { return replaceDbgDeclare(AI, NewAllocaAddress, AI->getNextNode(), Builder, Deref, Offset); } void llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap) { BasicBlock *BB = I->getParent(); // Loop over all of the successors, removing BB's entry from any PHI // nodes. for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) (*SI)->removePredecessor(BB); // Insert a call to llvm.trap right before this. This turns the undefined // behavior into a hard fail instead of falling through into random code. if (UseLLVMTrap) { Function *TrapFn = Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap); CallInst *CallTrap = CallInst::Create(TrapFn, "", I); CallTrap->setDebugLoc(I->getDebugLoc()); } new UnreachableInst(I->getContext(), I); // All instructions after this are dead. BasicBlock::iterator BBI = I->getIterator(), BBE = BB->end(); while (BBI != BBE) { if (!BBI->use_empty()) BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); BB->getInstList().erase(BBI++); } } /// changeToCall - Convert the specified invoke into a normal call. static void changeToCall(InvokeInst *II) { SmallVector Args(II->arg_begin(), II->arg_end()); SmallVector OpBundles; II->getOperandBundlesAsDefs(OpBundles); CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, OpBundles, "", II); NewCall->takeName(II); NewCall->setCallingConv(II->getCallingConv()); NewCall->setAttributes(II->getAttributes()); NewCall->setDebugLoc(II->getDebugLoc()); II->replaceAllUsesWith(NewCall); // Follow the call by a branch to the normal destination. BranchInst::Create(II->getNormalDest(), II); // Update PHI nodes in the unwind destination II->getUnwindDest()->removePredecessor(II->getParent()); II->eraseFromParent(); } static bool markAliveBlocks(Function &F, SmallPtrSetImpl &Reachable) { SmallVector Worklist; BasicBlock *BB = &F.front(); Worklist.push_back(BB); Reachable.insert(BB); bool Changed = false; do { BB = Worklist.pop_back_val(); // Do a quick scan of the basic block, turning any obviously unreachable // instructions into LLVM unreachable insts. The instruction combining pass // canonicalizes unreachable insts into stores to null or undef. for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){ // Assumptions that are known to be false are equivalent to unreachable. // Also, if the condition is undefined, then we make the choice most // beneficial to the optimizer, and choose that to also be unreachable. if (IntrinsicInst *II = dyn_cast(BBI)) if (II->getIntrinsicID() == Intrinsic::assume) { bool MakeUnreachable = false; if (isa(II->getArgOperand(0))) MakeUnreachable = true; else if (ConstantInt *Cond = dyn_cast(II->getArgOperand(0))) MakeUnreachable = Cond->isZero(); if (MakeUnreachable) { // Don't insert a call to llvm.trap right before the unreachable. changeToUnreachable(&*BBI, false); Changed = true; break; } } if (CallInst *CI = dyn_cast(BBI)) { if (CI->doesNotReturn()) { // If we found a call to a no-return function, insert an unreachable // instruction after it. Make sure there isn't *already* one there // though. ++BBI; if (!isa(BBI)) { // Don't insert a call to llvm.trap right before the unreachable. changeToUnreachable(&*BBI, false); Changed = true; } break; } } // Store to undef and store to null are undefined and used to signal that // they should be changed to unreachable by passes that can't modify the // CFG. if (StoreInst *SI = dyn_cast(BBI)) { // Don't touch volatile stores. if (SI->isVolatile()) continue; Value *Ptr = SI->getOperand(1); if (isa(Ptr) || (isa(Ptr) && SI->getPointerAddressSpace() == 0)) { changeToUnreachable(SI, true); Changed = true; break; } } } TerminatorInst *Terminator = BB->getTerminator(); if (auto *II = dyn_cast(Terminator)) { // Turn invokes that call 'nounwind' functions into ordinary calls. Value *Callee = II->getCalledValue(); if (isa(Callee) || isa(Callee)) { changeToUnreachable(II, true); Changed = true; } else if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(&F)) { if (II->use_empty() && II->onlyReadsMemory()) { // jump to the normal destination branch. BranchInst::Create(II->getNormalDest(), II); II->getUnwindDest()->removePredecessor(II->getParent()); II->eraseFromParent(); } else changeToCall(II); Changed = true; } } else if (auto *CatchSwitch = dyn_cast(Terminator)) { // Remove catchpads which cannot be reached. struct CatchPadDenseMapInfo { static CatchPadInst *getEmptyKey() { return DenseMapInfo::getEmptyKey(); } static CatchPadInst *getTombstoneKey() { return DenseMapInfo::getTombstoneKey(); } static unsigned getHashValue(CatchPadInst *CatchPad) { return static_cast(hash_combine_range( CatchPad->value_op_begin(), CatchPad->value_op_end())); } static bool isEqual(CatchPadInst *LHS, CatchPadInst *RHS) { if (LHS == getEmptyKey() || LHS == getTombstoneKey() || RHS == getEmptyKey() || RHS == getTombstoneKey()) return LHS == RHS; return LHS->isIdenticalTo(RHS); } }; // Set of unique CatchPads. SmallDenseMap> HandlerSet; detail::DenseSetEmpty Empty; for (CatchSwitchInst::handler_iterator I = CatchSwitch->handler_begin(), E = CatchSwitch->handler_end(); I != E; ++I) { BasicBlock *HandlerBB = *I; auto *CatchPad = cast(HandlerBB->getFirstNonPHI()); if (!HandlerSet.insert({CatchPad, Empty}).second) { CatchSwitch->removeHandler(I); --I; --E; Changed = true; } } } Changed |= ConstantFoldTerminator(BB, true); for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) if (Reachable.insert(*SI).second) Worklist.push_back(*SI); } while (!Worklist.empty()); return Changed; } void llvm::removeUnwindEdge(BasicBlock *BB) { TerminatorInst *TI = BB->getTerminator(); if (auto *II = dyn_cast(TI)) { changeToCall(II); return; } TerminatorInst *NewTI; BasicBlock *UnwindDest; if (auto *CRI = dyn_cast(TI)) { NewTI = CleanupReturnInst::Create(CRI->getCleanupPad(), nullptr, CRI); UnwindDest = CRI->getUnwindDest(); } else if (auto *CatchSwitch = dyn_cast(TI)) { auto *NewCatchSwitch = CatchSwitchInst::Create( CatchSwitch->getParentPad(), nullptr, CatchSwitch->getNumHandlers(), CatchSwitch->getName(), CatchSwitch); for (BasicBlock *PadBB : CatchSwitch->handlers()) NewCatchSwitch->addHandler(PadBB); NewTI = NewCatchSwitch; UnwindDest = CatchSwitch->getUnwindDest(); } else { llvm_unreachable("Could not find unwind successor"); } NewTI->takeName(TI); NewTI->setDebugLoc(TI->getDebugLoc()); UnwindDest->removePredecessor(BB); TI->replaceAllUsesWith(NewTI); TI->eraseFromParent(); } /// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even /// if they are in a dead cycle. Return true if a change was made, false /// otherwise. bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) { SmallPtrSet Reachable; bool Changed = markAliveBlocks(F, Reachable); // If there are unreachable blocks in the CFG... if (Reachable.size() == F.size()) return Changed; assert(Reachable.size() < F.size()); NumRemoved += F.size()-Reachable.size(); // Loop over all of the basic blocks that are not reachable, dropping all of // their internal references... for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) { if (Reachable.count(&*BB)) continue; for (succ_iterator SI = succ_begin(&*BB), SE = succ_end(&*BB); SI != SE; ++SI) if (Reachable.count(*SI)) (*SI)->removePredecessor(&*BB); if (LVI) LVI->eraseBlock(&*BB); BB->dropAllReferences(); } for (Function::iterator I = ++F.begin(); I != F.end();) if (!Reachable.count(&*I)) I = F.getBasicBlockList().erase(I); else ++I; return true; } void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef KnownIDs) { SmallVector, 4> Metadata; K->dropUnknownNonDebugMetadata(KnownIDs); K->getAllMetadataOtherThanDebugLoc(Metadata); for (unsigned i = 0, n = Metadata.size(); i < n; ++i) { unsigned Kind = Metadata[i].first; MDNode *JMD = J->getMetadata(Kind); MDNode *KMD = Metadata[i].second; switch (Kind) { default: K->setMetadata(Kind, nullptr); // Remove unknown metadata break; case LLVMContext::MD_dbg: llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg"); case LLVMContext::MD_tbaa: K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD)); break; case LLVMContext::MD_alias_scope: K->setMetadata(Kind, MDNode::getMostGenericAliasScope(JMD, KMD)); break; case LLVMContext::MD_noalias: K->setMetadata(Kind, MDNode::intersect(JMD, KMD)); break; case LLVMContext::MD_range: K->setMetadata(Kind, MDNode::getMostGenericRange(JMD, KMD)); break; case LLVMContext::MD_fpmath: K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD)); break; case LLVMContext::MD_invariant_load: // Only set the !invariant.load if it is present in both instructions. K->setMetadata(Kind, JMD); break; case LLVMContext::MD_nonnull: // Only set the !nonnull if it is present in both instructions. K->setMetadata(Kind, JMD); break; case LLVMContext::MD_invariant_group: // Preserve !invariant.group in K. break; case LLVMContext::MD_align: K->setMetadata(Kind, MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); break; case LLVMContext::MD_dereferenceable: case LLVMContext::MD_dereferenceable_or_null: K->setMetadata(Kind, MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); break; } } // Set !invariant.group from J if J has it. If both instructions have it // then we will just pick it from J - even when they are different. // Also make sure that K is load or store - f.e. combining bitcast with load // could produce bitcast with invariant.group metadata, which is invalid. // FIXME: we should try to preserve both invariant.group md if they are // different, but right now instruction can only have one invariant.group. if (auto *JMD = J->getMetadata(LLVMContext::MD_invariant_group)) if (isa(K) || isa(K)) K->setMetadata(LLVMContext::MD_invariant_group, JMD); } unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, const BasicBlockEdge &Root) { assert(From->getType() == To->getType()); unsigned Count = 0; for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); UI != UE; ) { Use &U = *UI++; if (DT.dominates(Root, U)) { U.set(To); DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as " << *To << " in " << *U << "\n"); ++Count; } } return Count; } unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, const BasicBlock *BB) { assert(From->getType() == To->getType()); unsigned Count = 0; for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); UI != UE;) { Use &U = *UI++; auto *I = cast(U.getUser()); if (DT.dominates(BB, I->getParent())) { U.set(To); DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as " << *To << " in " << *U << "\n"); ++Count; } } return Count; } bool llvm::callsGCLeafFunction(ImmutableCallSite CS) { if (isa(CS.getInstruction())) // Most LLVM intrinsics are things which can never take a safepoint. // As a result, we don't need to have the stack parsable at the // callsite. This is a highly useful optimization since intrinsic // calls are fairly prevalent, particularly in debug builds. return true; // Check if the function is specifically marked as a gc leaf function. if (CS.hasFnAttr("gc-leaf-function")) return true; if (const Function *F = CS.getCalledFunction()) return F->hasFnAttribute("gc-leaf-function"); return false; } + +/// A potential constituent of a bitreverse or bswap expression. See +/// collectBitParts for a fuller explanation. +struct BitPart { + BitPart(Value *P, unsigned BW) : Provider(P) { + Provenance.resize(BW); + } + + /// The Value that this is a bitreverse/bswap of. + Value *Provider; + /// The "provenance" of each bit. Provenance[A] = B means that bit A + /// in Provider becomes bit B in the result of this expression. + SmallVector Provenance; // int8_t means max size is i128. + + enum { Unset = -1 }; +}; + +/// Analyze the specified subexpression and see if it is capable of providing +/// pieces of a bswap or bitreverse. The subexpression provides a potential +/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in +/// the output of the expression came from a corresponding bit in some other +/// value. This function is recursive, and the end result is a mapping of +/// bitnumber to bitnumber. It is the caller's responsibility to validate that +/// the bitnumber to bitnumber mapping is correct for a bswap or bitreverse. +/// +/// For example, if the current subexpression if "(shl i32 %X, 24)" then we know +/// that the expression deposits the low byte of %X into the high byte of the +/// result and that all other bits are zero. This expression is accepted and a +/// BitPart is returned with Provider set to %X and Provenance[24-31] set to +/// [0-7]. +/// +/// To avoid revisiting values, the BitPart results are memoized into the +/// provided map. To avoid unnecessary copying of BitParts, BitParts are +/// constructed in-place in the \c BPS map. Because of this \c BPS needs to +/// store BitParts objects, not pointers. As we need the concept of a nullptr +/// BitParts (Value has been analyzed and the analysis failed), we an Optional +/// type instead to provide the same functionality. +/// +/// Because we pass around references into \c BPS, we must use a container that +/// does not invalidate internal references (std::map instead of DenseMap). +/// +static const Optional & +collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals, + std::map> &BPS) { + auto I = BPS.find(V); + if (I != BPS.end()) + return I->second; + + auto &Result = BPS[V] = None; + auto BitWidth = cast(V->getType())->getBitWidth(); + + if (Instruction *I = dyn_cast(V)) { + // If this is an or instruction, it may be an inner node of the bswap. + if (I->getOpcode() == Instruction::Or) { + auto &A = collectBitParts(I->getOperand(0), MatchBSwaps, + MatchBitReversals, BPS); + auto &B = collectBitParts(I->getOperand(1), MatchBSwaps, + MatchBitReversals, BPS); + if (!A || !B) + return Result; + + // Try and merge the two together. + if (!A->Provider || A->Provider != B->Provider) + return Result; + + Result = BitPart(A->Provider, BitWidth); + for (unsigned i = 0; i < A->Provenance.size(); ++i) { + if (A->Provenance[i] != BitPart::Unset && + B->Provenance[i] != BitPart::Unset && + A->Provenance[i] != B->Provenance[i]) + return Result = None; + + if (A->Provenance[i] == BitPart::Unset) + Result->Provenance[i] = B->Provenance[i]; + else + Result->Provenance[i] = A->Provenance[i]; + } + + return Result; + } + + // If this is a logical shift by a constant, recurse then shift the result. + if (I->isLogicalShift() && isa(I->getOperand(1))) { + unsigned BitShift = + cast(I->getOperand(1))->getLimitedValue(~0U); + // Ensure the shift amount is defined. + if (BitShift > BitWidth) + return Result; + + auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, + MatchBitReversals, BPS); + if (!Res) + return Result; + Result = Res; + + // Perform the "shift" on BitProvenance. + auto &P = Result->Provenance; + if (I->getOpcode() == Instruction::Shl) { + P.erase(std::prev(P.end(), BitShift), P.end()); + P.insert(P.begin(), BitShift, BitPart::Unset); + } else { + P.erase(P.begin(), std::next(P.begin(), BitShift)); + P.insert(P.end(), BitShift, BitPart::Unset); + } + + return Result; + } + + // If this is a logical 'and' with a mask that clears bits, recurse then + // unset the appropriate bits. + if (I->getOpcode() == Instruction::And && + isa(I->getOperand(1))) { + APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1); + const APInt &AndMask = cast(I->getOperand(1))->getValue(); + + // Check that the mask allows a multiple of 8 bits for a bswap, for an + // early exit. + unsigned NumMaskedBits = AndMask.countPopulation(); + if (!MatchBitReversals && NumMaskedBits % 8 != 0) + return Result; + + auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, + MatchBitReversals, BPS); + if (!Res) + return Result; + Result = Res; + + for (unsigned i = 0; i < BitWidth; ++i, Bit <<= 1) + // If the AndMask is zero for this bit, clear the bit. + if ((AndMask & Bit) == 0) + Result->Provenance[i] = BitPart::Unset; + + return Result; + } + } + + // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be + // the input value to the bswap/bitreverse. + Result = BitPart(V, BitWidth); + for (unsigned i = 0; i < BitWidth; ++i) + Result->Provenance[i] = i; + return Result; +} + +static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To, + unsigned BitWidth) { + if (From % 8 != To % 8) + return false; + // Convert from bit indices to byte indices and check for a byte reversal. + From >>= 3; + To >>= 3; + BitWidth >>= 3; + return From == BitWidth - To - 1; +} + +static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To, + unsigned BitWidth) { + return From == BitWidth - To - 1; +} + +/// Given an OR instruction, check to see if this is a bitreverse +/// idiom. If so, insert the new intrinsic and return true. +bool llvm::recognizeBitReverseOrBSwapIdiom( + Instruction *I, bool MatchBSwaps, bool MatchBitReversals, + SmallVectorImpl &InsertedInsts) { + if (Operator::getOpcode(I) != Instruction::Or) + return false; + if (!MatchBSwaps && !MatchBitReversals) + return false; + IntegerType *ITy = dyn_cast(I->getType()); + if (!ITy || ITy->getBitWidth() > 128) + return false; // Can't do vectors or integers > 128 bits. + unsigned BW = ITy->getBitWidth(); + + // Try to find all the pieces corresponding to the bswap. + std::map> BPS; + auto Res = collectBitParts(I, MatchBSwaps, MatchBitReversals, BPS); + if (!Res) + return false; + auto &BitProvenance = Res->Provenance; + + // Now, is the bit permutation correct for a bswap or a bitreverse? We can + // only byteswap values with an even number of bytes. + bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true; + for (unsigned i = 0; i < BW; ++i) { + OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW); + OKForBitReverse &= + bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW); + } + + Intrinsic::ID Intrin; + if (OKForBSwap && MatchBSwaps) + Intrin = Intrinsic::bswap; + else if (OKForBitReverse && MatchBitReversals) + Intrin = Intrinsic::bitreverse; + else + return false; + + Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, ITy); + InsertedInsts.push_back(CallInst::Create(F, Res->Provider, "rev", I)); + return true; +} Index: projects/clang380-import/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp =================================================================== --- projects/clang380-import/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp (revision 294609) @@ -1,2593 +1,2616 @@ //===------ SimplifyLibCalls.cpp - Library calls simplifier ---------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This is a utility pass used for testing the InstructionSimplify analysis. // The analysis is applied to every instruction, and if it simplifies then the // instruction is replaced by the simplification. If you are looking for a pass // that performs serious instruction folding, use the instcombine pass instead. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/SimplifyLibCalls.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace PatternMatch; static cl::opt ColdErrorCalls("error-reporting-is-cold", cl::init(true), cl::Hidden, cl::desc("Treat error-reporting calls as cold")); static cl::opt EnableUnsafeFPShrink("enable-double-float-shrink", cl::Hidden, cl::init(false), cl::desc("Enable unsafe double to float " "shrinking for math lib calls")); //===----------------------------------------------------------------------===// // Helper Functions //===----------------------------------------------------------------------===// static bool ignoreCallingConv(LibFunc::Func Func) { return Func == LibFunc::abs || Func == LibFunc::labs || Func == LibFunc::llabs || Func == LibFunc::strlen; } /// Return true if it only matters that the value is equal or not-equal to zero. static bool isOnlyUsedInZeroEqualityComparison(Value *V) { for (User *U : V->users()) { if (ICmpInst *IC = dyn_cast(U)) if (IC->isEquality()) if (Constant *C = dyn_cast(IC->getOperand(1))) if (C->isNullValue()) continue; // Unknown instruction. return false; } return true; } /// Return true if it is only used in equality comparisons with With. static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) { for (User *U : V->users()) { if (ICmpInst *IC = dyn_cast(U)) if (IC->isEquality() && IC->getOperand(1) == With) continue; // Unknown instruction. return false; } return true; } static bool callHasFloatingPointArgument(const CallInst *CI) { return std::any_of(CI->op_begin(), CI->op_end(), [](const Use &OI) { return OI->getType()->isFloatingPointTy(); }); } /// \brief Check whether the overloaded unary floating point function /// corresponding to \a Ty is available. static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, LibFunc::Func DoubleFn, LibFunc::Func FloatFn, LibFunc::Func LongDoubleFn) { switch (Ty->getTypeID()) { case Type::FloatTyID: return TLI->has(FloatFn); case Type::DoubleTyID: return TLI->has(DoubleFn); default: return TLI->has(LongDoubleFn); } } /// \brief Check whether we can use unsafe floating point math for /// the function passed as input. static bool canUseUnsafeFPMath(Function *F) { // FIXME: For finer-grain optimization, we need intrinsics to have the same // fast-math flag decorations that are applied to FP instructions. For now, // we have to rely on the function-level unsafe-fp-math attribute to do this // optimization because there's no other way to express that the call can be // relaxed. if (F->hasFnAttribute("unsafe-fp-math")) { Attribute Attr = F->getFnAttribute("unsafe-fp-math"); if (Attr.getValueAsString() == "true") return true; } return false; } /// \brief Returns whether \p F matches the signature expected for the /// string/memory copying library function \p Func. /// Acceptable functions are st[rp][n]?cpy, memove, memcpy, and memset. /// Their fortified (_chk) counterparts are also accepted. static bool checkStringCopyLibFuncSignature(Function *F, LibFunc::Func Func) { const DataLayout &DL = F->getParent()->getDataLayout(); FunctionType *FT = F->getFunctionType(); LLVMContext &Context = F->getContext(); Type *PCharTy = Type::getInt8PtrTy(Context); Type *SizeTTy = DL.getIntPtrType(Context); unsigned NumParams = FT->getNumParams(); // All string libfuncs return the same type as the first parameter. if (FT->getReturnType() != FT->getParamType(0)) return false; switch (Func) { default: llvm_unreachable("Can't check signature for non-string-copy libfunc."); case LibFunc::stpncpy_chk: case LibFunc::strncpy_chk: --NumParams; // fallthrough case LibFunc::stpncpy: case LibFunc::strncpy: { if (NumParams != 3 || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != PCharTy || !FT->getParamType(2)->isIntegerTy()) return false; break; } case LibFunc::strcpy_chk: case LibFunc::stpcpy_chk: --NumParams; // fallthrough case LibFunc::stpcpy: case LibFunc::strcpy: { if (NumParams != 2 || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != PCharTy) return false; break; } case LibFunc::memmove_chk: case LibFunc::memcpy_chk: --NumParams; // fallthrough case LibFunc::memmove: case LibFunc::memcpy: { if (NumParams != 3 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || FT->getParamType(2) != SizeTTy) return false; break; } case LibFunc::memset_chk: --NumParams; // fallthrough case LibFunc::memset: { if (NumParams != 3 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy() || FT->getParamType(2) != SizeTTy) return false; break; } } // If this is a fortified libcall, the last parameter is a size_t. if (NumParams == FT->getNumParams() - 1) return FT->getParamType(FT->getNumParams() - 1) == SizeTTy; return true; } //===----------------------------------------------------------------------===// // String and Memory Library Call Optimizations //===----------------------------------------------------------------------===// Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); // Verify the "strcat" function prototype. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2|| FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || FT->getParamType(1) != FT->getReturnType()) return nullptr; // Extract some information from the instruction Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); if (Len == 0) return nullptr; --Len; // Unbias length. // Handle the simple, do-nothing case: strcat(x, "") -> x if (Len == 0) return Dst; return emitStrLenMemCpy(Src, Dst, Len, B); } Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B) { // We need to find the end of the destination string. That's where the // memory is to be moved to. We just generate a call to strlen. Value *DstLen = EmitStrLen(Dst, B, DL, TLI); if (!DstLen) return nullptr; // Now that we have the destination's length, we must index into the // destination's pointer to get the actual memcpy destination (end of // the string .. we're concatenating). Value *CpyDst = B.CreateGEP(B.getInt8Ty(), Dst, DstLen, "endptr"); // We have enough information to now generate the memcpy call to do the // concatenation for us. Make a memcpy to copy the nul byte with align = 1. B.CreateMemCpy(CpyDst, Src, ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1), 1); return Dst; } Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); // Verify the "strncat" function prototype. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || FT->getParamType(1) != FT->getReturnType() || !FT->getParamType(2)->isIntegerTy()) return nullptr; // Extract some information from the instruction. Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); uint64_t Len; // We don't do anything if length is not constant. if (ConstantInt *LengthArg = dyn_cast(CI->getArgOperand(2))) Len = LengthArg->getZExtValue(); else return nullptr; // See if we can get the length of the input string. uint64_t SrcLen = GetStringLength(Src); if (SrcLen == 0) return nullptr; --SrcLen; // Unbias length. // Handle the simple, do-nothing cases: // strncat(x, "", c) -> x // strncat(x, c, 0) -> x if (SrcLen == 0 || Len == 0) return Dst; // We don't optimize this case. if (Len < SrcLen) return nullptr; // strncat(x, s, c) -> strcat(x, s) // s is constant so the strcat can be optimized further. return emitStrLenMemCpy(Src, Dst, SrcLen, B); } Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); // Verify the "strchr" function prototype. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || !FT->getParamType(1)->isIntegerTy(32)) return nullptr; Value *SrcStr = CI->getArgOperand(0); // If the second operand is non-constant, see if we can compute the length // of the input string and turn this into memchr. ConstantInt *CharC = dyn_cast(CI->getArgOperand(1)); if (!CharC) { uint64_t Len = GetStringLength(SrcStr); if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32. return nullptr; return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), B, DL, TLI); } // Otherwise, the character is a constant, see if the first argument is // a string literal. If so, we can constant fold. StringRef Str; if (!getConstantStringInfo(SrcStr, Str)) { if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p) return B.CreateGEP(B.getInt8Ty(), SrcStr, EmitStrLen(SrcStr, B, DL, TLI), "strchr"); return nullptr; } // Compute the offset, make sure to handle the case when we're searching for // zero (a weird way to spell strlen). size_t I = (0xFF & CharC->getSExtValue()) == 0 ? Str.size() : Str.find(CharC->getSExtValue()); if (I == StringRef::npos) // Didn't find the char. strchr returns null. return Constant::getNullValue(CI->getType()); // strchr(s+n,c) -> gep(s+n+i,c) return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strchr"); } Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); // Verify the "strrchr" function prototype. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || !FT->getParamType(1)->isIntegerTy(32)) return nullptr; Value *SrcStr = CI->getArgOperand(0); ConstantInt *CharC = dyn_cast(CI->getArgOperand(1)); // Cannot fold anything if we're not looking for a constant. if (!CharC) return nullptr; StringRef Str; if (!getConstantStringInfo(SrcStr, Str)) { // strrchr(s, 0) -> strchr(s, 0) if (CharC->isZero()) return EmitStrChr(SrcStr, '\0', B, TLI); return nullptr; } // Compute the offset. size_t I = (0xFF & CharC->getSExtValue()) == 0 ? Str.size() : Str.rfind(CharC->getSExtValue()); if (I == StringRef::npos) // Didn't find the char. Return null. return Constant::getNullValue(CI->getType()); // strrchr(s+n,c) -> gep(s+n+i,c) return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strrchr"); } Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); // Verify the "strcmp" function prototype. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getReturnType()->isIntegerTy(32) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != B.getInt8PtrTy()) return nullptr; Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); if (Str1P == Str2P) // strcmp(x,x) -> 0 return ConstantInt::get(CI->getType(), 0); StringRef Str1, Str2; bool HasStr1 = getConstantStringInfo(Str1P, Str1); bool HasStr2 = getConstantStringInfo(Str2P, Str2); // strcmp(x, y) -> cnst (if both x and y are constant strings) if (HasStr1 && HasStr2) return ConstantInt::get(CI->getType(), Str1.compare(Str2)); if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x return B.CreateNeg( B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType())); if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); // strcmp(P, "x") -> memcmp(P, "x", 2) uint64_t Len1 = GetStringLength(Str1P); uint64_t Len2 = GetStringLength(Str2P); if (Len1 && Len2) { return EmitMemCmp(Str1P, Str2P, ConstantInt::get(DL.getIntPtrType(CI->getContext()), std::min(Len1, Len2)), B, DL, TLI); } return nullptr; } Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); // Verify the "strncmp" function prototype. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || !FT->getReturnType()->isIntegerTy(32) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != B.getInt8PtrTy() || !FT->getParamType(2)->isIntegerTy()) return nullptr; Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); if (Str1P == Str2P) // strncmp(x,x,n) -> 0 return ConstantInt::get(CI->getType(), 0); // Get the length argument if it is constant. uint64_t Length; if (ConstantInt *LengthArg = dyn_cast(CI->getArgOperand(2))) Length = LengthArg->getZExtValue(); else return nullptr; if (Length == 0) // strncmp(x,y,0) -> 0 return ConstantInt::get(CI->getType(), 0); if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, DL, TLI); StringRef Str1, Str2; bool HasStr1 = getConstantStringInfo(Str1P, Str1); bool HasStr2 = getConstantStringInfo(Str2P, Str2); // strncmp(x, y) -> cnst (if both x and y are constant strings) if (HasStr1 && HasStr2) { StringRef SubStr1 = Str1.substr(0, Length); StringRef SubStr2 = Str2.substr(0, Length); return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2)); } if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x return B.CreateNeg( B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType())); if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); return nullptr; } Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strcpy)) return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // strcpy(x,x) -> x return Src; // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); if (Len == 0) return nullptr; // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. B.CreateMemCpy(Dst, Src, ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), 1); return Dst; } Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::stpcpy)) return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) Value *StrLen = EmitStrLen(Src, B, DL, TLI); return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr; } // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); if (Len == 0) return nullptr; Type *PT = Callee->getFunctionType()->getParamType(0); Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len); Value *DstEnd = B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(DL.getIntPtrType(PT), Len - 1)); // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. B.CreateMemCpy(Dst, Src, LenV, 1); return DstEnd; } Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strncpy)) return nullptr; Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); Value *LenOp = CI->getArgOperand(2); // See if we can get the length of the input string. uint64_t SrcLen = GetStringLength(Src); if (SrcLen == 0) return nullptr; --SrcLen; if (SrcLen == 0) { // strncpy(x, "", y) -> memset(x, '\0', y, 1) B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1); return Dst; } uint64_t Len; if (ConstantInt *LengthArg = dyn_cast(LenOp)) Len = LengthArg->getZExtValue(); else return nullptr; if (Len == 0) return Dst; // strncpy(x, y, 0) -> x // Let strncpy handle the zero padding if (Len > SrcLen + 1) return nullptr; Type *PT = Callee->getFunctionType()->getParamType(0); // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] B.CreateMemCpy(Dst, Src, ConstantInt::get(DL.getIntPtrType(PT), Len), 1); return Dst; } Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 1 || FT->getParamType(0) != B.getInt8PtrTy() || !FT->getReturnType()->isIntegerTy()) return nullptr; Value *Src = CI->getArgOperand(0); // Constant folding: strlen("xyz") -> 3 if (uint64_t Len = GetStringLength(Src)) return ConstantInt::get(CI->getType(), Len - 1); // strlen(x?"foo":"bars") --> x ? 3 : 4 if (SelectInst *SI = dyn_cast(Src)) { uint64_t LenTrue = GetStringLength(SI->getTrueValue()); uint64_t LenFalse = GetStringLength(SI->getFalseValue()); if (LenTrue && LenFalse) { Function *Caller = CI->getParent()->getParent(); emitOptimizationRemark(CI->getContext(), "simplify-libcalls", *Caller, SI->getDebugLoc(), "folded strlen(select) to select of constants"); return B.CreateSelect(SI->getCondition(), ConstantInt::get(CI->getType(), LenTrue - 1), ConstantInt::get(CI->getType(), LenFalse - 1)); } } // strlen(x) != 0 --> *x != 0 // strlen(x) == 0 --> *x == 0 if (isOnlyUsedInZeroEqualityComparison(CI)) return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType()); return nullptr; } Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || FT->getReturnType() != FT->getParamType(0)) return nullptr; StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); // strpbrk(s, "") -> nullptr // strpbrk("", s) -> nullptr if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) return Constant::getNullValue(CI->getType()); // Constant folding. if (HasS1 && HasS2) { size_t I = S1.find_first_of(S2); if (I == StringRef::npos) // No match. return Constant::getNullValue(CI->getType()); return B.CreateGEP(B.getInt8Ty(), CI->getArgOperand(0), B.getInt64(I), "strpbrk"); } // strpbrk(s, "a") -> strchr(s, 'a') if (HasS2 && S2.size() == 1) return EmitStrChr(CI->getArgOperand(0), S2[0], B, TLI); return nullptr; } Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy()) return nullptr; Value *EndPtr = CI->getArgOperand(1); if (isa(EndPtr)) { // With a null EndPtr, this function won't capture the main argument. // It would be readonly too, except that it still may write to errno. CI->addAttribute(1, Attribute::NoCapture); } return nullptr; } Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || !FT->getReturnType()->isIntegerTy()) return nullptr; StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); // strspn(s, "") -> 0 // strspn("", s) -> 0 if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) return Constant::getNullValue(CI->getType()); // Constant folding. if (HasS1 && HasS2) { size_t Pos = S1.find_first_not_of(S2); if (Pos == StringRef::npos) Pos = S1.size(); return ConstantInt::get(CI->getType(), Pos); } return nullptr; } Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || !FT->getReturnType()->isIntegerTy()) return nullptr; StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); // strcspn("", s) -> 0 if (HasS1 && S1.empty()) return Constant::getNullValue(CI->getType()); // Constant folding. if (HasS1 && HasS2) { size_t Pos = S1.find_first_of(S2); if (Pos == StringRef::npos) Pos = S1.size(); return ConstantInt::get(CI->getType(), Pos); } // strcspn(s, "") -> strlen(s) if (HasS2 && S2.empty()) return EmitStrLen(CI->getArgOperand(0), B, DL, TLI); return nullptr; } Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isPointerTy()) return nullptr; // fold strstr(x, x) -> x. if (CI->getArgOperand(0) == CI->getArgOperand(1)) return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0 if (isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, DL, TLI); if (!StrLen) return nullptr; Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1), StrLen, B, DL, TLI); if (!StrNCmp) return nullptr; for (auto UI = CI->user_begin(), UE = CI->user_end(); UI != UE;) { ICmpInst *Old = cast(*UI++); Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp, ConstantInt::getNullValue(StrNCmp->getType()), "cmp"); replaceAllUsesWith(Old, Cmp); } return CI; } // See if either input string is a constant string. StringRef SearchStr, ToFindStr; bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr); bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr); // fold strstr(x, "") -> x. if (HasStr2 && ToFindStr.empty()) return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); // If both strings are known, constant fold it. if (HasStr1 && HasStr2) { size_t Offset = SearchStr.find(ToFindStr); if (Offset == StringRef::npos) // strstr("foo", "bar") -> null return Constant::getNullValue(CI->getType()); // strstr("abcd", "bc") -> gep((char*)"abcd", 1) Value *Result = CastToCStr(CI->getArgOperand(0), B); Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr"); return B.CreateBitCast(Result, CI->getType()); } // fold strstr(x, "y") -> strchr(x, 'y'). if (HasStr2 && ToFindStr.size() == 1) { Value *StrChr = EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI); return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : nullptr; } return nullptr; } Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy(32) || !FT->getParamType(2)->isIntegerTy() || !FT->getReturnType()->isPointerTy()) return nullptr; Value *SrcStr = CI->getArgOperand(0); ConstantInt *CharC = dyn_cast(CI->getArgOperand(1)); ConstantInt *LenC = dyn_cast(CI->getArgOperand(2)); // memchr(x, y, 0) -> null if (LenC && LenC->isNullValue()) return Constant::getNullValue(CI->getType()); // From now on we need at least constant length and string. StringRef Str; if (!LenC || !getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false)) return nullptr; // Truncate the string to LenC. If Str is smaller than LenC we will still only // scan the string, as reading past the end of it is undefined and we can just // return null if we don't find the char. Str = Str.substr(0, LenC->getZExtValue()); // If the char is variable but the input str and length are not we can turn // this memchr call into a simple bit field test. Of course this only works // when the return value is only checked against null. // // It would be really nice to reuse switch lowering here but we can't change // the CFG at this point. // // memchr("\r\n", C, 2) != nullptr -> (C & ((1 << '\r') | (1 << '\n'))) != 0 // after bounds check. if (!CharC && !Str.empty() && isOnlyUsedInZeroEqualityComparison(CI)) { unsigned char Max = *std::max_element(reinterpret_cast(Str.begin()), reinterpret_cast(Str.end())); // Make sure the bit field we're about to create fits in a register on the // target. // FIXME: On a 64 bit architecture this prevents us from using the // interesting range of alpha ascii chars. We could do better by emitting // two bitfields or shifting the range by 64 if no lower chars are used. if (!DL.fitsInLegalInteger(Max + 1)) return nullptr; // For the bit field use a power-of-2 type with at least 8 bits to avoid // creating unnecessary illegal types. unsigned char Width = NextPowerOf2(std::max((unsigned char)7, Max)); // Now build the bit field. APInt Bitfield(Width, 0); for (char C : Str) Bitfield.setBit((unsigned char)C); Value *BitfieldC = B.getInt(Bitfield); // First check that the bit field access is within bounds. Value *C = B.CreateZExtOrTrunc(CI->getArgOperand(1), BitfieldC->getType()); Value *Bounds = B.CreateICmp(ICmpInst::ICMP_ULT, C, B.getIntN(Width, Width), "memchr.bounds"); // Create code that checks if the given bit is set in the field. Value *Shl = B.CreateShl(B.getIntN(Width, 1ULL), C); Value *Bits = B.CreateIsNotNull(B.CreateAnd(Shl, BitfieldC), "memchr.bits"); // Finally merge both checks and cast to pointer type. The inttoptr // implicitly zexts the i1 to intptr type. return B.CreateIntToPtr(B.CreateAnd(Bounds, Bits, "memchr"), CI->getType()); } // Check if all arguments are constants. If so, we can constant fold. if (!CharC) return nullptr; // Compute the offset. size_t I = Str.find(CharC->getSExtValue() & 0xFF); if (I == StringRef::npos) // Didn't find the char. memchr returns null. return Constant::getNullValue(CI->getType()); // memchr(s+n,c,l) -> gep(s+n+i,c) return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "memchr"); } Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy(32)) return nullptr; Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); if (LHS == RHS) // memcmp(s,s,x) -> 0 return Constant::getNullValue(CI->getType()); // Make sure we have a constant length. ConstantInt *LenC = dyn_cast(CI->getArgOperand(2)); if (!LenC) return nullptr; uint64_t Len = LenC->getZExtValue(); if (Len == 0) // memcmp(s1,s2,0) -> 0 return Constant::getNullValue(CI->getType()); // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS if (Len == 1) { Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"), CI->getType(), "lhsv"); Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"), CI->getType(), "rhsv"); return B.CreateSub(LHSV, RHSV, "chardiff"); } // memcmp(S1,S2,N/8)==0 -> (*(intN_t*)S1 != *(intN_t*)S2)==0 if (DL.isLegalInteger(Len * 8) && isOnlyUsedInZeroEqualityComparison(CI)) { IntegerType *IntType = IntegerType::get(CI->getContext(), Len * 8); unsigned PrefAlignment = DL.getPrefTypeAlignment(IntType); if (getKnownAlignment(LHS, DL, CI) >= PrefAlignment && getKnownAlignment(RHS, DL, CI) >= PrefAlignment) { Type *LHSPtrTy = IntType->getPointerTo(LHS->getType()->getPointerAddressSpace()); Type *RHSPtrTy = IntType->getPointerTo(RHS->getType()->getPointerAddressSpace()); Value *LHSV = B.CreateLoad(B.CreateBitCast(LHS, LHSPtrTy, "lhsc"), "lhsv"); Value *RHSV = B.CreateLoad(B.CreateBitCast(RHS, RHSPtrTy, "rhsc"), "rhsv"); return B.CreateZExt(B.CreateICmpNE(LHSV, RHSV), CI->getType(), "memcmp"); } } // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant) StringRef LHSStr, RHSStr; if (getConstantStringInfo(LHS, LHSStr) && getConstantStringInfo(RHS, RHSStr)) { // Make sure we're not reading out-of-bounds memory. if (Len > LHSStr.size() || Len > RHSStr.size()) return nullptr; // Fold the memcmp and normalize the result. This way we get consistent // results across multiple platforms. uint64_t Ret = 0; int Cmp = memcmp(LHSStr.data(), RHSStr.data(), Len); if (Cmp < 0) Ret = -1; else if (Cmp > 0) Ret = 1; return ConstantInt::get(CI->getType(), Ret); } return nullptr; } Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy)) return nullptr; // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), 1); return CI->getArgOperand(0); } Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove)) return nullptr; // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), 1); return CI->getArgOperand(0); } Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset)) return nullptr; // memset(p, v, n) -> llvm.memset(p, v, n, 1) Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); return CI->getArgOperand(0); } //===----------------------------------------------------------------------===// // Math Library Optimizations //===----------------------------------------------------------------------===// /// Return a variant of Val with float type. /// Currently this works in two cases: If Val is an FPExtension of a float /// value to something bigger, simply return the operand. /// If Val is a ConstantFP but can be converted to a float ConstantFP without /// loss of precision do so. static Value *valueHasFloatPrecision(Value *Val) { if (FPExtInst *Cast = dyn_cast(Val)) { Value *Op = Cast->getOperand(0); if (Op->getType()->isFloatTy()) return Op; } if (ConstantFP *Const = dyn_cast(Val)) { APFloat F = Const->getValueAPF(); bool losesInfo; (void)F.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &losesInfo); if (!losesInfo) return ConstantFP::get(Const->getContext(), F); } return nullptr; } -//===----------------------------------------------------------------------===// -// Double -> Float Shrinking Optimizations for Unary Functions like 'floor' +/// Any floating-point library function that we're trying to simplify will have +/// a signature of the form: fptype foo(fptype param1, fptype param2, ...). +/// CheckDoubleTy indicates that 'fptype' must be 'double'. +static bool matchesFPLibFunctionSignature(const Function *F, unsigned NumParams, + bool CheckDoubleTy) { + FunctionType *FT = F->getFunctionType(); + if (FT->getNumParams() != NumParams) + return false; -Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, - bool CheckRetType) { + // The return type must match what we're looking for. + Type *RetTy = FT->getReturnType(); + if (CheckDoubleTy ? !RetTy->isDoubleTy() : !RetTy->isFloatingPointTy()) + return false; + + // Each parameter must match the return type, and therefore, match every other + // parameter too. + for (const Type *ParamTy : FT->params()) + if (ParamTy != RetTy) + return false; + + return true; +} + +/// Shrink double -> float for unary functions like 'floor'. +static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, + bool CheckRetType) { Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || - !FT->getParamType(0)->isDoubleTy()) + if (!matchesFPLibFunctionSignature(Callee, 1, true)) return nullptr; if (CheckRetType) { // Check if all the uses for function like 'sin' are converted to float. for (User *U : CI->users()) { FPTruncInst *Cast = dyn_cast(U); if (!Cast || !Cast->getType()->isFloatTy()) return nullptr; } } // If this is something like 'floor((double)floatval)', convert to floorf. Value *V = valueHasFloatPrecision(CI->getArgOperand(0)); if (V == nullptr) return nullptr; // Propagate fast-math flags from the existing call to the new call. IRBuilder<>::FastMathFlagGuard Guard(B); B.setFastMathFlags(CI->getFastMathFlags()); // floor((double)floatval) -> (double)floorf(floatval) if (Callee->isIntrinsic()) { Module *M = CI->getModule(); Intrinsic::ID IID = Callee->getIntrinsicID(); Function *F = Intrinsic::getDeclaration(M, IID, B.getFloatTy()); V = B.CreateCall(F, V); } else { // The call is a library call rather than an intrinsic. V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes()); } return B.CreateFPExt(V, B.getDoubleTy()); } -// Double -> Float Shrinking Optimizations for Binary Functions like 'fmin/fmax' -Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { +/// Shrink double -> float for binary functions like 'fmin/fmax'. +static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 2 arguments of the same FP type, which match the - // result type. - if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - !FT->getParamType(0)->isFloatingPointTy()) + if (!matchesFPLibFunctionSignature(Callee, 2, true)) return nullptr; // If this is something like 'fmin((double)floatval1, (double)floatval2)', // or fmin(1.0, (double)floatval), then we convert it to fminf. Value *V1 = valueHasFloatPrecision(CI->getArgOperand(0)); if (V1 == nullptr) return nullptr; Value *V2 = valueHasFloatPrecision(CI->getArgOperand(1)); if (V2 == nullptr) return nullptr; // Propagate fast-math flags from the existing call to the new call. IRBuilder<>::FastMathFlagGuard Guard(B); B.setFastMathFlags(CI->getFastMathFlags()); // fmin((double)floatval1, (double)floatval2) // -> (double)fminf(floatval1, floatval2) // TODO: Handle intrinsics in the same way as in optimizeUnaryDoubleFP(). Value *V = EmitBinaryFloatFnCall(V1, V2, Callee->getName(), B, Callee->getAttributes()); return B.CreateFPExt(V, B.getDoubleTy()); } Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; StringRef Name = Callee->getName(); if (UnsafeFPShrink && Name == "cos" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 argument of FP type, which matches the // result type. if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isFloatingPointTy()) return Ret; // cos(-x) -> cos(x) Value *Op1 = CI->getArgOperand(0); if (BinaryOperator::isFNeg(Op1)) { BinaryOperator *BinExpr = cast(Op1); return B.CreateCall(Callee, BinExpr->getOperand(1), "cos"); } return Ret; } static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) { // Multiplications calculated using Addition Chains. // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html assert(Exp != 0 && "Incorrect exponent 0 not handled"); if (InnerChain[Exp]) return InnerChain[Exp]; static const unsigned AddChain[33][2] = { {0, 0}, // Unused. {0, 0}, // Unused (base case = pow1). {1, 1}, // Unused (pre-computed). {1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}, {4, 4}, {1, 8}, {5, 5}, {1, 10}, {6, 6}, {4, 9}, {7, 7}, {3, 12}, {8, 8}, {8, 9}, {2, 16}, {1, 18}, {10, 10}, {6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13}, {3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16}, }; InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B), getPow(InnerChain, AddChain[Exp][1], B)); return InnerChain[Exp]; } Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; StringRef Name = Callee->getName(); if (UnsafeFPShrink && Name == "pow" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 2 arguments of the same FP type, which match the // result type. if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || !FT->getParamType(0)->isFloatingPointTy()) return Ret; Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1); if (ConstantFP *Op1C = dyn_cast(Op1)) { // pow(1.0, x) -> 1.0 if (Op1C->isExactlyValue(1.0)) return Op1C; // pow(2.0, x) -> exp2(x) if (Op1C->isExactlyValue(2.0) && hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f, LibFunc::exp2l)) return EmitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp2), B, Callee->getAttributes()); // pow(10.0, x) -> exp10(x) if (Op1C->isExactlyValue(10.0) && hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp10, LibFunc::exp10f, LibFunc::exp10l)) return EmitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp10), B, Callee->getAttributes()); } // FIXME: Use instruction-level FMF. bool UnsafeFPMath = canUseUnsafeFPMath(CI->getParent()->getParent()); // pow(exp(x), y) -> exp(x * y) // pow(exp2(x), y) -> exp2(x * y) // We enable these only with fast-math. Besides rounding differences, the // transformation changes overflow and underflow behavior quite dramatically. // Example: x = 1000, y = 0.001. // pow(exp(x), y) = pow(inf, 0.001) = inf, whereas exp(x*y) = exp(1). auto *OpC = dyn_cast(Op1); if (OpC && OpC->hasUnsafeAlgebra() && CI->hasUnsafeAlgebra()) { LibFunc::Func Func; Function *OpCCallee = OpC->getCalledFunction(); if (OpCCallee && TLI->getLibFunc(OpCCallee->getName(), Func) && TLI->has(Func) && (Func == LibFunc::exp || Func == LibFunc::exp2)) { IRBuilder<>::FastMathFlagGuard Guard(B); B.setFastMathFlags(CI->getFastMathFlags()); Value *FMul = B.CreateFMul(OpC->getArgOperand(0), Op2, "mul"); return EmitUnaryFloatFnCall(FMul, OpCCallee->getName(), B, OpCCallee->getAttributes()); } } ConstantFP *Op2C = dyn_cast(Op2); if (!Op2C) return Ret; if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0 return ConstantFP::get(CI->getType(), 1.0); if (Op2C->isExactlyValue(0.5) && hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf, LibFunc::sqrtl) && hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::fabs, LibFunc::fabsf, LibFunc::fabsl)) { // In -ffast-math, pow(x, 0.5) -> sqrt(x). if (CI->hasUnsafeAlgebra()) { IRBuilder<>::FastMathFlagGuard Guard(B); B.setFastMathFlags(CI->getFastMathFlags()); return EmitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B, Callee->getAttributes()); } // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))). // This is faster than calling pow, and still handles negative zero // and negative infinity correctly. // TODO: In finite-only mode, this could be just fabs(sqrt(x)). Value *Inf = ConstantFP::getInfinity(CI->getType()); Value *NegInf = ConstantFP::getInfinity(CI->getType(), true); Value *Sqrt = EmitUnaryFloatFnCall(Op1, "sqrt", B, Callee->getAttributes()); Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B, Callee->getAttributes()); Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf); Value *Sel = B.CreateSelect(FCmp, Inf, FAbs); return Sel; } if (Op2C->isExactlyValue(1.0)) // pow(x, 1.0) -> x return Op1; if (Op2C->isExactlyValue(2.0)) // pow(x, 2.0) -> x*x return B.CreateFMul(Op1, Op1, "pow2"); if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip"); // In -ffast-math, generate repeated fmul instead of generating pow(x, n). if (UnsafeFPMath) { APFloat V = abs(Op2C->getValueAPF()); // We limit to a max of 7 fmul(s). Thus max exponent is 32. // This transformation applies to integer exponents only. if (V.compare(APFloat(V.getSemantics(), 32.0)) == APFloat::cmpGreaterThan || !V.isInteger()) return nullptr; // We will memoize intermediate products of the Addition Chain. Value *InnerChain[33] = {nullptr}; InnerChain[1] = Op1; InnerChain[2] = B.CreateFMul(Op1, Op1); // We cannot readily convert a non-double type (like float) to a double. // So we first convert V to something which could be converted to double. bool ignored; V.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored); Value *FMul = getPow(InnerChain, V.convertToDouble(), B); // For negative exponents simply compute the reciprocal. if (Op2C->isNegative()) FMul = B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), FMul); return FMul; } return nullptr; } Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Function *Caller = CI->getParent()->getParent(); Value *Ret = nullptr; StringRef Name = Callee->getName(); if (UnsafeFPShrink && Name == "exp2" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 argument of FP type, which matches the // result type. if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isFloatingPointTy()) return Ret; Value *Op = CI->getArgOperand(0); // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32 // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32 LibFunc::Func LdExp = LibFunc::ldexpl; if (Op->getType()->isFloatTy()) LdExp = LibFunc::ldexpf; else if (Op->getType()->isDoubleTy()) LdExp = LibFunc::ldexp; if (TLI->has(LdExp)) { Value *LdExpArg = nullptr; if (SIToFPInst *OpC = dyn_cast(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); } else if (UIToFPInst *OpC = dyn_cast(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty()); } if (LdExpArg) { Constant *One = ConstantFP::get(CI->getContext(), APFloat(1.0f)); if (!Op->getType()->isFloatTy()) One = ConstantExpr::getFPExtend(One, Op->getType()); Module *M = Caller->getParent(); Value *Callee = M->getOrInsertFunction(TLI->getName(LdExp), Op->getType(), Op->getType(), B.getInt32Ty(), nullptr); CallInst *CI = B.CreateCall(Callee, {One, LdExpArg}); if (const Function *F = dyn_cast(Callee->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); return CI; } } return Ret; } Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; StringRef Name = Callee->getName(); if (Name == "fabs" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, false); FunctionType *FT = Callee->getFunctionType(); // Make sure this has 1 argument of FP type which matches the result type. if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isFloatingPointTy()) return Ret; Value *Op = CI->getArgOperand(0); if (Instruction *I = dyn_cast(Op)) { // Fold fabs(x * x) -> x * x; any squared FP value must already be positive. if (I->getOpcode() == Instruction::FMul) if (I->getOperand(0) == I->getOperand(1)) return Op; } return Ret; } Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) { // If we can shrink the call to a float function rather than a double // function, do that first. Function *Callee = CI->getCalledFunction(); StringRef Name = Callee->getName(); if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name)) if (Value *Ret = optimizeBinaryDoubleFP(CI, B)) return Ret; // Make sure this has 2 arguments of FP type which match the result type. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || !FT->getParamType(0)->isFloatingPointTy()) return nullptr; IRBuilder<>::FastMathFlagGuard Guard(B); FastMathFlags FMF; if (CI->hasUnsafeAlgebra()) { // Unsafe algebra sets all fast-math-flags to true. FMF.setUnsafeAlgebra(); } else { // At a minimum, no-nans-fp-math must be true. if (!CI->hasNoNaNs()) return nullptr; // No-signed-zeros is implied by the definitions of fmax/fmin themselves: // "Ideally, fmax would be sensitive to the sign of zero, for example // fmax(-0. 0, +0. 0) would return +0; however, implementation in software // might be impractical." FMF.setNoSignedZeros(); FMF.setNoNaNs(); } B.setFastMathFlags(FMF); // We have a relaxed floating-point environment. We can ignore NaN-handling // and transform to a compare and select. We do not have to consider errno or // exceptions, because fmin/fmax do not have those. Value *Op0 = CI->getArgOperand(0); Value *Op1 = CI->getArgOperand(1); Value *Cmp = Callee->getName().startswith("fmin") ? B.CreateFCmpOLT(Op0, Op1) : B.CreateFCmpOGT(Op0, Op1); return B.CreateSelect(Cmp, Op0, Op1); } Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; StringRef Name = Callee->getName(); if (UnsafeFPShrink && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 argument of FP type, which matches the // result type. if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isFloatingPointTy()) return Ret; if (!CI->hasUnsafeAlgebra()) return Ret; Value *Op1 = CI->getArgOperand(0); auto *OpC = dyn_cast(Op1); // The earlier call must also be unsafe in order to do these transforms. if (!OpC || !OpC->hasUnsafeAlgebra()) return Ret; // log(pow(x,y)) -> y*log(x) // This is only applicable to log, log2, log10. if (Name != "log" && Name != "log2" && Name != "log10") return Ret; IRBuilder<>::FastMathFlagGuard Guard(B); FastMathFlags FMF; FMF.setUnsafeAlgebra(); B.setFastMathFlags(FMF); LibFunc::Func Func; Function *F = OpC->getCalledFunction(); if (F && ((TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && Func == LibFunc::pow) || F->getIntrinsicID() == Intrinsic::pow)) return B.CreateFMul(OpC->getArgOperand(1), EmitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B, Callee->getAttributes()), "mul"); // log(exp2(y)) -> y*log(2) if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && Func == LibFunc::exp2) return B.CreateFMul( OpC->getArgOperand(0), EmitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0), Callee->getName(), B, Callee->getAttributes()), "logmul"); return Ret; } Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - + Value *Ret = nullptr; if (TLI->has(LibFunc::sqrtf) && (Callee->getName() == "sqrt" || Callee->getIntrinsicID() == Intrinsic::sqrt)) Ret = optimizeUnaryDoubleFP(CI, B, true); + + // FIXME: Refactor - this check is repeated all over this file and even in the + // preceding call to shrink double -> float. + + // Make sure this has 1 argument of FP type, which matches the result type. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isFloatingPointTy()) + return Ret; if (!CI->hasUnsafeAlgebra()) return Ret; Instruction *I = dyn_cast(CI->getArgOperand(0)); if (!I || I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra()) return Ret; // We're looking for a repeated factor in a multiplication tree, // so we can do this fold: sqrt(x * x) -> fabs(x); // or this fold: sqrt((x * x) * y) -> fabs(x) * sqrt(y). Value *Op0 = I->getOperand(0); Value *Op1 = I->getOperand(1); Value *RepeatOp = nullptr; Value *OtherOp = nullptr; if (Op0 == Op1) { // Simple match: the operands of the multiply are identical. RepeatOp = Op0; } else { // Look for a more complicated pattern: one of the operands is itself // a multiply, so search for a common factor in that multiply. // Note: We don't bother looking any deeper than this first level or for // variations of this pattern because instcombine's visitFMUL and/or the // reassociation pass should give us this form. Value *OtherMul0, *OtherMul1; if (match(Op0, m_FMul(m_Value(OtherMul0), m_Value(OtherMul1)))) { // Pattern: sqrt((x * y) * z) if (OtherMul0 == OtherMul1 && cast(Op0)->hasUnsafeAlgebra()) { // Matched: sqrt((x * x) * z) RepeatOp = OtherMul0; OtherOp = Op1; } } } if (!RepeatOp) return Ret; // Fast math flags for any created instructions should match the sqrt // and multiply. IRBuilder<>::FastMathFlagGuard Guard(B); B.setFastMathFlags(I->getFastMathFlags()); // If we found a repeated factor, hoist it out of the square root and // replace it with the fabs of that factor. Module *M = Callee->getParent(); Type *ArgType = I->getType(); Value *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType); Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs"); if (OtherOp) { // If we found a non-repeated factor, we still need to get its square // root. We then multiply that by the value that was simplified out // of the square root calculation. Value *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType); Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt"); return B.CreateFMul(FabsCall, SqrtCall); } return FabsCall; } // TODO: Generalize to handle any trig function and its inverse. Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; StringRef Name = Callee->getName(); if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 argument of FP type, which matches the // result type. if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isFloatingPointTy()) return Ret; Value *Op1 = CI->getArgOperand(0); auto *OpC = dyn_cast(Op1); if (!OpC) return Ret; // Both calls must allow unsafe optimizations in order to remove them. if (!CI->hasUnsafeAlgebra() || !OpC->hasUnsafeAlgebra()) return Ret; // tan(atan(x)) -> x // tanf(atanf(x)) -> x // tanl(atanl(x)) -> x LibFunc::Func Func; Function *F = OpC->getCalledFunction(); if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && ((Func == LibFunc::atan && Callee->getName() == "tan") || (Func == LibFunc::atanf && Callee->getName() == "tanf") || (Func == LibFunc::atanl && Callee->getName() == "tanl"))) Ret = OpC->getArgOperand(0); return Ret; } static bool isTrigLibCall(CallInst *CI); static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, bool UseFloat, Value *&Sin, Value *&Cos, Value *&SinCos); Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilder<> &B) { // Make sure the prototype is as expected, otherwise the rest of the // function is probably invalid and likely to abort. if (!isTrigLibCall(CI)) return nullptr; Value *Arg = CI->getArgOperand(0); SmallVector SinCalls; SmallVector CosCalls; SmallVector SinCosCalls; bool IsFloat = Arg->getType()->isFloatTy(); // Look for all compatible sinpi, cospi and sincospi calls with the same // argument. If there are enough (in some sense) we can make the // substitution. for (User *U : Arg->users()) classifyArgUse(U, CI->getParent(), IsFloat, SinCalls, CosCalls, SinCosCalls); // It's only worthwhile if both sinpi and cospi are actually used. if (SinCosCalls.empty() && (SinCalls.empty() || CosCalls.empty())) return nullptr; Value *Sin, *Cos, *SinCos; insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, SinCos); replaceTrigInsts(SinCalls, Sin); replaceTrigInsts(CosCalls, Cos); replaceTrigInsts(SinCosCalls, SinCos); return nullptr; } static bool isTrigLibCall(CallInst *CI) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); // We can only hope to do anything useful if we can ignore things like errno // and floating-point exceptions. bool AttributesSafe = CI->hasFnAttr(Attribute::NoUnwind) && CI->hasFnAttr(Attribute::ReadNone); // Other than that we need float(float) or double(double) return AttributesSafe && FT->getNumParams() == 1 && FT->getReturnType() == FT->getParamType(0) && (FT->getParamType(0)->isFloatTy() || FT->getParamType(0)->isDoubleTy()); } void LibCallSimplifier::classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat, SmallVectorImpl &SinCalls, SmallVectorImpl &CosCalls, SmallVectorImpl &SinCosCalls) { CallInst *CI = dyn_cast(Val); if (!CI) return; Function *Callee = CI->getCalledFunction(); LibFunc::Func Func; if (!Callee || !TLI->getLibFunc(Callee->getName(), Func) || !TLI->has(Func) || !isTrigLibCall(CI)) return; if (IsFloat) { if (Func == LibFunc::sinpif) SinCalls.push_back(CI); else if (Func == LibFunc::cospif) CosCalls.push_back(CI); else if (Func == LibFunc::sincospif_stret) SinCosCalls.push_back(CI); } else { if (Func == LibFunc::sinpi) SinCalls.push_back(CI); else if (Func == LibFunc::cospi) CosCalls.push_back(CI); else if (Func == LibFunc::sincospi_stret) SinCosCalls.push_back(CI); } } void LibCallSimplifier::replaceTrigInsts(SmallVectorImpl &Calls, Value *Res) { for (CallInst *C : Calls) replaceAllUsesWith(C, Res); } void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, bool UseFloat, Value *&Sin, Value *&Cos, Value *&SinCos) { Type *ArgTy = Arg->getType(); Type *ResTy; StringRef Name; Triple T(OrigCallee->getParent()->getTargetTriple()); if (UseFloat) { Name = "__sincospif_stret"; assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now"); // x86_64 can't use {float, float} since that would be returned in both // xmm0 and xmm1, which isn't what a real struct would do. ResTy = T.getArch() == Triple::x86_64 ? static_cast(VectorType::get(ArgTy, 2)) : static_cast(StructType::get(ArgTy, ArgTy, nullptr)); } else { Name = "__sincospi_stret"; ResTy = StructType::get(ArgTy, ArgTy, nullptr); } Module *M = OrigCallee->getParent(); Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(), ResTy, ArgTy, nullptr); if (Instruction *ArgInst = dyn_cast(Arg)) { // If the argument is an instruction, it must dominate all uses so put our // sincos call there. B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator()); } else { // Otherwise (e.g. for a constant) the beginning of the function is as // good a place as any. BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock(); B.SetInsertPoint(&EntryBB, EntryBB.begin()); } SinCos = B.CreateCall(Callee, Arg, "sincospi"); if (SinCos->getType()->isStructTy()) { Sin = B.CreateExtractValue(SinCos, 0, "sinpi"); Cos = B.CreateExtractValue(SinCos, 1, "cospi"); } else { Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0), "sinpi"); Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1), "cospi"); } } //===----------------------------------------------------------------------===// // Integer Library Call Optimizations //===----------------------------------------------------------------------===// static bool checkIntUnaryReturnAndParam(Function *Callee) { FunctionType *FT = Callee->getFunctionType(); return FT->getNumParams() == 1 && FT->getReturnType()->isIntegerTy(32) && FT->getParamType(0)->isIntegerTy(); } Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkIntUnaryReturnAndParam(Callee)) return nullptr; Value *Op = CI->getArgOperand(0); // Constant fold. if (ConstantInt *CI = dyn_cast(Op)) { if (CI->isZero()) // ffs(0) -> 0. return B.getInt32(0); // ffs(c) -> cttz(c)+1 return B.getInt32(CI->getValue().countTrailingZeros() + 1); } // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0 Type *ArgType = Op->getType(); Value *F = Intrinsic::getDeclaration(Callee->getParent(), Intrinsic::cttz, ArgType); Value *V = B.CreateCall(F, {Op, B.getTrue()}, "cttz"); V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1)); V = B.CreateIntCast(V, B.getInt32Ty(), false); Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType)); return B.CreateSelect(Cond, V, B.getInt32(0)); } Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); // We require integer(integer) where the types agree. if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || FT->getParamType(0) != FT->getReturnType()) return nullptr; // abs(x) -> x >s -1 ? x : -x Value *Op = CI->getArgOperand(0); Value *Pos = B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()), "ispos"); Value *Neg = B.CreateNeg(Op, "neg"); return B.CreateSelect(Pos, Op, Neg); } Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) { if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) return nullptr; // isdigit(c) -> (c-'0') getArgOperand(0); Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp"); Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit"); return B.CreateZExt(Op, CI->getType()); } Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) { if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) return nullptr; // isascii(c) -> c getArgOperand(0); Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii"); return B.CreateZExt(Op, CI->getType()); } Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilder<> &B) { if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) return nullptr; // toascii(c) -> c & 0x7f return B.CreateAnd(CI->getArgOperand(0), ConstantInt::get(CI->getType(), 0x7F)); } //===----------------------------------------------------------------------===// // Formatting and IO Library Call Optimizations //===----------------------------------------------------------------------===// static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg); Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B, int StreamArg) { // Error reporting calls should be cold, mark them as such. // This applies even to non-builtin calls: it is only a hint and applies to // functions that the frontend might not understand as builtins. // This heuristic was suggested in: // Improving Static Branch Prediction in a Compiler // Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu // Proceedings of PACT'98, Oct. 1998, IEEE Function *Callee = CI->getCalledFunction(); if (!CI->hasFnAttr(Attribute::Cold) && isReportingError(Callee, CI, StreamArg)) { CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold); } return nullptr; } static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) { if (!ColdErrorCalls || !Callee || !Callee->isDeclaration()) return false; if (StreamArg < 0) return true; // These functions might be considered cold, but only if their stream // argument is stderr. if (StreamArg >= (int)CI->getNumArgOperands()) return false; LoadInst *LI = dyn_cast(CI->getArgOperand(StreamArg)); if (!LI) return false; GlobalVariable *GV = dyn_cast(LI->getPointerOperand()); if (!GV || !GV->isDeclaration()) return false; return GV->getName() == "stderr"; } Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) { // Check for a fixed format string. StringRef FormatStr; if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr)) return nullptr; // Empty format string -> noop. if (FormatStr.empty()) // Tolerate printf's declared void. return CI->use_empty() ? (Value *)CI : ConstantInt::get(CI->getType(), 0); // Do not do any of the following transformations if the printf return value // is used, in general the printf return value is not compatible with either // putchar() or puts(). if (!CI->use_empty()) return nullptr; // printf("x") -> putchar('x'), even for '%'. if (FormatStr.size() == 1) { Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TLI); if (CI->use_empty() || !Res) return Res; return B.CreateIntCast(Res, CI->getType(), true); } // printf("foo\n") --> puts("foo") if (FormatStr[FormatStr.size() - 1] == '\n' && FormatStr.find('%') == StringRef::npos) { // No format characters. // Create a string literal with no \n on it. We expect the constant merge // pass to be run after this pass, to merge duplicate strings. FormatStr = FormatStr.drop_back(); Value *GV = B.CreateGlobalString(FormatStr, "str"); Value *NewCI = EmitPutS(GV, B, TLI); return (CI->use_empty() || !NewCI) ? NewCI : ConstantInt::get(CI->getType(), FormatStr.size() + 1); } // Optimize specific format strings. // printf("%c", chr) --> putchar(chr) if (FormatStr == "%c" && CI->getNumArgOperands() > 1 && CI->getArgOperand(1)->getType()->isIntegerTy()) { Value *Res = EmitPutChar(CI->getArgOperand(1), B, TLI); if (CI->use_empty() || !Res) return Res; return B.CreateIntCast(Res, CI->getType(), true); } // printf("%s\n", str) --> puts(str) if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 && CI->getArgOperand(1)->getType()->isPointerTy()) { return EmitPutS(CI->getArgOperand(1), B, TLI); } return nullptr; } Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); // Require one fixed pointer argument and an integer/void result. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy())) return nullptr; if (Value *V = optimizePrintFString(CI, B)) { return V; } // printf(format, ...) -> iprintf(format, ...) if no floating point // arguments. if (TLI->has(LibFunc::iprintf) && !callHasFloatingPointArgument(CI)) { Module *M = B.GetInsertBlock()->getParent()->getParent(); Constant *IPrintFFn = M->getOrInsertFunction("iprintf", FT, Callee->getAttributes()); CallInst *New = cast(CI->clone()); New->setCalledFunction(IPrintFFn); B.Insert(New); return New; } return nullptr; } Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { // Check for a fixed format string. StringRef FormatStr; if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) return nullptr; // If we just have a format string (nothing else crazy) transform it. if (CI->getNumArgOperands() == 2) { // Make sure there's no % in the constant array. We could try to handle // %% -> % in the future if we cared. for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) if (FormatStr[i] == '%') return nullptr; // we found a format specifier, bail out. // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size() + 1), 1); // Copy the null byte. return ConstantInt::get(CI->getType(), FormatStr.size()); } // The remaining optimizations require the format string to be "%s" or "%c" // and have an extra operand. if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumArgOperands() < 3) return nullptr; // Decode the second character of the format string. if (FormatStr[1] == 'c') { // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return nullptr; Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char"); Value *Ptr = CastToCStr(CI->getArgOperand(0), B); B.CreateStore(V, Ptr); Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul"); B.CreateStore(B.getInt8(0), Ptr); return ConstantInt::get(CI->getType(), 1); } if (FormatStr[1] == 's') { // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1) if (!CI->getArgOperand(2)->getType()->isPointerTy()) return nullptr; Value *Len = EmitStrLen(CI->getArgOperand(2), B, DL, TLI); if (!Len) return nullptr; Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc"); B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1); // The sprintf result is the unincremented number of bytes in the string. return B.CreateIntCast(Len, CI->getType(), false); } return nullptr; } Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); // Require two fixed pointer arguments and an integer result. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy()) return nullptr; if (Value *V = optimizeSPrintFString(CI, B)) { return V; } // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating // point arguments. if (TLI->has(LibFunc::siprintf) && !callHasFloatingPointArgument(CI)) { Module *M = B.GetInsertBlock()->getParent()->getParent(); Constant *SIPrintFFn = M->getOrInsertFunction("siprintf", FT, Callee->getAttributes()); CallInst *New = cast(CI->clone()); New->setCalledFunction(SIPrintFFn); B.Insert(New); return New; } return nullptr; } Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) { optimizeErrorReporting(CI, B, 0); // All the optimizations depend on the format string. StringRef FormatStr; if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) return nullptr; // Do not do any of the following transformations if the fprintf return // value is used, in general the fprintf return value is not compatible // with fwrite(), fputc() or fputs(). if (!CI->use_empty()) return nullptr; // fprintf(F, "foo") --> fwrite("foo", 3, 1, F) if (CI->getNumArgOperands() == 2) { for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) if (FormatStr[i] == '%') // Could handle %% -> % if we cared. return nullptr; // We found a format specifier. return EmitFWrite( CI->getArgOperand(1), ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size()), CI->getArgOperand(0), B, DL, TLI); } // The remaining optimizations require the format string to be "%s" or "%c" // and have an extra operand. if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumArgOperands() < 3) return nullptr; // Decode the second character of the format string. if (FormatStr[1] == 'c') { // fprintf(F, "%c", chr) --> fputc(chr, F) if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return nullptr; return EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI); } if (FormatStr[1] == 's') { // fprintf(F, "%s", str) --> fputs(str, F) if (!CI->getArgOperand(2)->getType()->isPointerTy()) return nullptr; return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI); } return nullptr; } Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); // Require two fixed paramters as pointers and integer result. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy()) return nullptr; if (Value *V = optimizeFPrintFString(CI, B)) { return V; } // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no // floating point arguments. if (TLI->has(LibFunc::fiprintf) && !callHasFloatingPointArgument(CI)) { Module *M = B.GetInsertBlock()->getParent()->getParent(); Constant *FIPrintFFn = M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes()); CallInst *New = cast(CI->clone()); New->setCalledFunction(FIPrintFFn); B.Insert(New); return New; } return nullptr; } Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) { optimizeErrorReporting(CI, B, 3); Function *Callee = CI->getCalledFunction(); // Require a pointer, an integer, an integer, a pointer, returning integer. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy() || !FT->getParamType(2)->isIntegerTy() || !FT->getParamType(3)->isPointerTy() || !FT->getReturnType()->isIntegerTy()) return nullptr; // Get the element size and count. ConstantInt *SizeC = dyn_cast(CI->getArgOperand(1)); ConstantInt *CountC = dyn_cast(CI->getArgOperand(2)); if (!SizeC || !CountC) return nullptr; uint64_t Bytes = SizeC->getZExtValue() * CountC->getZExtValue(); // If this is writing zero records, remove the call (it's a noop). if (Bytes == 0) return ConstantInt::get(CI->getType(), 0); // If this is writing one byte, turn it into fputc. // This optimisation is only valid, if the return value is unused. if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F) Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char"); Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TLI); return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr; } return nullptr; } Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) { optimizeErrorReporting(CI, B, 1); Function *Callee = CI->getCalledFunction(); // Require two pointers. Also, we can't optimize if return value is used. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !CI->use_empty()) return nullptr; // fputs(s,F) --> fwrite(s,1,strlen(s),F) uint64_t Len = GetStringLength(CI->getArgOperand(0)); if (!Len) return nullptr; // Known to have no uses (see above). return EmitFWrite( CI->getArgOperand(0), ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1), CI->getArgOperand(1), B, DL, TLI); } Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); // Require one fixed pointer argument and an integer/void result. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy())) return nullptr; // Check for a constant string. StringRef Str; if (!getConstantStringInfo(CI->getArgOperand(0), Str)) return nullptr; if (Str.empty() && CI->use_empty()) { // puts("") -> putchar('\n') Value *Res = EmitPutChar(B.getInt32('\n'), B, TLI); if (CI->use_empty() || !Res) return Res; return B.CreateIntCast(Res, CI->getType(), true); } return nullptr; } bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { LibFunc::Func Func; SmallString<20> FloatFuncName = FuncName; FloatFuncName += 'f'; if (TLI->getLibFunc(FloatFuncName, Func)) return TLI->has(Func); return false; } Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, IRBuilder<> &Builder) { LibFunc::Func Func; Function *Callee = CI->getCalledFunction(); StringRef FuncName = Callee->getName(); // Check for string/memory library functions. if (TLI->getLibFunc(FuncName, Func) && TLI->has(Func)) { // Make sure we never change the calling convention. assert((ignoreCallingConv(Func) || CI->getCallingConv() == llvm::CallingConv::C) && "Optimizing string/memory libcall would change the calling convention"); switch (Func) { case LibFunc::strcat: return optimizeStrCat(CI, Builder); case LibFunc::strncat: return optimizeStrNCat(CI, Builder); case LibFunc::strchr: return optimizeStrChr(CI, Builder); case LibFunc::strrchr: return optimizeStrRChr(CI, Builder); case LibFunc::strcmp: return optimizeStrCmp(CI, Builder); case LibFunc::strncmp: return optimizeStrNCmp(CI, Builder); case LibFunc::strcpy: return optimizeStrCpy(CI, Builder); case LibFunc::stpcpy: return optimizeStpCpy(CI, Builder); case LibFunc::strncpy: return optimizeStrNCpy(CI, Builder); case LibFunc::strlen: return optimizeStrLen(CI, Builder); case LibFunc::strpbrk: return optimizeStrPBrk(CI, Builder); case LibFunc::strtol: case LibFunc::strtod: case LibFunc::strtof: case LibFunc::strtoul: case LibFunc::strtoll: case LibFunc::strtold: case LibFunc::strtoull: return optimizeStrTo(CI, Builder); case LibFunc::strspn: return optimizeStrSpn(CI, Builder); case LibFunc::strcspn: return optimizeStrCSpn(CI, Builder); case LibFunc::strstr: return optimizeStrStr(CI, Builder); case LibFunc::memchr: return optimizeMemChr(CI, Builder); case LibFunc::memcmp: return optimizeMemCmp(CI, Builder); case LibFunc::memcpy: return optimizeMemCpy(CI, Builder); case LibFunc::memmove: return optimizeMemMove(CI, Builder); case LibFunc::memset: return optimizeMemSet(CI, Builder); default: break; } } return nullptr; } Value *LibCallSimplifier::optimizeCall(CallInst *CI) { if (CI->isNoBuiltin()) return nullptr; LibFunc::Func Func; Function *Callee = CI->getCalledFunction(); StringRef FuncName = Callee->getName(); SmallVector OpBundles; CI->getOperandBundlesAsDefs(OpBundles); IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles); bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C; // Command-line parameter overrides function attribute. if (EnableUnsafeFPShrink.getNumOccurrences() > 0) UnsafeFPShrink = EnableUnsafeFPShrink; else if (canUseUnsafeFPMath(Callee)) UnsafeFPShrink = true; // First, check for intrinsics. if (IntrinsicInst *II = dyn_cast(CI)) { if (!isCallingConvC) return nullptr; switch (II->getIntrinsicID()) { case Intrinsic::pow: return optimizePow(CI, Builder); case Intrinsic::exp2: return optimizeExp2(CI, Builder); case Intrinsic::fabs: return optimizeFabs(CI, Builder); case Intrinsic::log: return optimizeLog(CI, Builder); case Intrinsic::sqrt: return optimizeSqrt(CI, Builder); default: return nullptr; } } // Also try to simplify calls to fortified library functions. if (Value *SimplifiedFortifiedCI = FortifiedSimplifier.optimizeCall(CI)) { // Try to further simplify the result. CallInst *SimplifiedCI = dyn_cast(SimplifiedFortifiedCI); if (SimplifiedCI && SimplifiedCI->getCalledFunction()) { // Use an IR Builder from SimplifiedCI if available instead of CI // to guarantee we reach all uses we might replace later on. IRBuilder<> TmpBuilder(SimplifiedCI); if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, TmpBuilder)) { // If we were able to further simplify, remove the now redundant call. SimplifiedCI->replaceAllUsesWith(V); SimplifiedCI->eraseFromParent(); return V; } } return SimplifiedFortifiedCI; } // Then check for known library functions. if (TLI->getLibFunc(FuncName, Func) && TLI->has(Func)) { // We never change the calling convention. if (!ignoreCallingConv(Func) && !isCallingConvC) return nullptr; if (Value *V = optimizeStringMemoryLibCall(CI, Builder)) return V; switch (Func) { case LibFunc::cosf: case LibFunc::cos: case LibFunc::cosl: return optimizeCos(CI, Builder); case LibFunc::sinpif: case LibFunc::sinpi: case LibFunc::cospif: case LibFunc::cospi: return optimizeSinCosPi(CI, Builder); case LibFunc::powf: case LibFunc::pow: case LibFunc::powl: return optimizePow(CI, Builder); case LibFunc::exp2l: case LibFunc::exp2: case LibFunc::exp2f: return optimizeExp2(CI, Builder); case LibFunc::fabsf: case LibFunc::fabs: case LibFunc::fabsl: return optimizeFabs(CI, Builder); case LibFunc::sqrtf: case LibFunc::sqrt: case LibFunc::sqrtl: return optimizeSqrt(CI, Builder); case LibFunc::ffs: case LibFunc::ffsl: case LibFunc::ffsll: return optimizeFFS(CI, Builder); case LibFunc::abs: case LibFunc::labs: case LibFunc::llabs: return optimizeAbs(CI, Builder); case LibFunc::isdigit: return optimizeIsDigit(CI, Builder); case LibFunc::isascii: return optimizeIsAscii(CI, Builder); case LibFunc::toascii: return optimizeToAscii(CI, Builder); case LibFunc::printf: return optimizePrintF(CI, Builder); case LibFunc::sprintf: return optimizeSPrintF(CI, Builder); case LibFunc::fprintf: return optimizeFPrintF(CI, Builder); case LibFunc::fwrite: return optimizeFWrite(CI, Builder); case LibFunc::fputs: return optimizeFPuts(CI, Builder); case LibFunc::log: case LibFunc::log10: case LibFunc::log1p: case LibFunc::log2: case LibFunc::logb: return optimizeLog(CI, Builder); case LibFunc::puts: return optimizePuts(CI, Builder); case LibFunc::tan: case LibFunc::tanf: case LibFunc::tanl: return optimizeTan(CI, Builder); case LibFunc::perror: return optimizeErrorReporting(CI, Builder); case LibFunc::vfprintf: case LibFunc::fiprintf: return optimizeErrorReporting(CI, Builder, 0); case LibFunc::fputc: return optimizeErrorReporting(CI, Builder, 1); case LibFunc::ceil: case LibFunc::floor: case LibFunc::rint: case LibFunc::round: case LibFunc::nearbyint: case LibFunc::trunc: if (hasFloatVersion(FuncName)) return optimizeUnaryDoubleFP(CI, Builder, false); return nullptr; case LibFunc::acos: case LibFunc::acosh: case LibFunc::asin: case LibFunc::asinh: case LibFunc::atan: case LibFunc::atanh: case LibFunc::cbrt: case LibFunc::cosh: case LibFunc::exp: case LibFunc::exp10: case LibFunc::expm1: case LibFunc::sin: case LibFunc::sinh: case LibFunc::tanh: if (UnsafeFPShrink && hasFloatVersion(FuncName)) return optimizeUnaryDoubleFP(CI, Builder, true); return nullptr; case LibFunc::copysign: if (hasFloatVersion(FuncName)) return optimizeBinaryDoubleFP(CI, Builder); return nullptr; case LibFunc::fminf: case LibFunc::fmin: case LibFunc::fminl: case LibFunc::fmaxf: case LibFunc::fmax: case LibFunc::fmaxl: return optimizeFMinFMax(CI, Builder); default: return nullptr; } } return nullptr; } LibCallSimplifier::LibCallSimplifier( const DataLayout &DL, const TargetLibraryInfo *TLI, function_ref Replacer) : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), UnsafeFPShrink(false), Replacer(Replacer) {} void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { // Indirect through the replacer used in this instance. Replacer(I, With); } // TODO: // Additional cases that we need to add to this file: // // cbrt: // * cbrt(expN(X)) -> expN(x/3) // * cbrt(sqrt(x)) -> pow(x,1/6) // * cbrt(cbrt(x)) -> pow(x,1/9) // // exp, expf, expl: // * exp(log(x)) -> x // // log, logf, logl: // * log(exp(x)) -> x // * log(exp(y)) -> y*log(e) // * log(exp10(y)) -> y*log(10) // * log(sqrt(x)) -> 0.5*log(x) // // lround, lroundf, lroundl: // * lround(cnst) -> cnst' // // pow, powf, powl: // * pow(sqrt(x),y) -> pow(x,y*0.5) // * pow(pow(x,y),z)-> pow(x,y*z) // // round, roundf, roundl: // * round(cnst) -> cnst' // // signbit: // * signbit(cnst) -> cnst' // * signbit(nncst) -> 0 (if pstv is a non-negative constant) // // sqrt, sqrtf, sqrtl: // * sqrt(expN(x)) -> expN(x*0.5) // * sqrt(Nroot(x)) -> pow(x,1/(2*N)) // * sqrt(pow(x,y)) -> pow(|x|,y*0.5) // // trunc, truncf, truncl: // * trunc(cnst) -> cnst' // // //===----------------------------------------------------------------------===// // Fortified Library Call Optimizations //===----------------------------------------------------------------------===// bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI, unsigned ObjSizeOp, unsigned SizeOp, bool isString) { if (CI->getArgOperand(ObjSizeOp) == CI->getArgOperand(SizeOp)) return true; if (ConstantInt *ObjSizeCI = dyn_cast(CI->getArgOperand(ObjSizeOp))) { if (ObjSizeCI->isAllOnesValue()) return true; // If the object size wasn't -1 (unknown), bail out if we were asked to. if (OnlyLowerUnknownSize) return false; if (isString) { uint64_t Len = GetStringLength(CI->getArgOperand(SizeOp)); // If the length is 0 we don't know how long it is and so we can't // remove the check. if (Len == 0) return false; return ObjSizeCI->getZExtValue() >= Len; } if (ConstantInt *SizeCI = dyn_cast(CI->getArgOperand(SizeOp))) return ObjSizeCI->getZExtValue() >= SizeCI->getZExtValue(); } return false; } Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy_chk)) return nullptr; if (isFortifiedCallFoldable(CI, 3, 2, false)) { B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), 1); return CI->getArgOperand(0); } return nullptr; } Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove_chk)) return nullptr; if (isFortifiedCallFoldable(CI, 3, 2, false)) { B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), 1); return CI->getArgOperand(0); } return nullptr; } Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset_chk)) return nullptr; if (isFortifiedCallFoldable(CI, 3, 2, false)) { Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); return CI->getArgOperand(0); } return nullptr; } Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, IRBuilder<> &B, LibFunc::Func Func) { Function *Callee = CI->getCalledFunction(); StringRef Name = Callee->getName(); const DataLayout &DL = CI->getModule()->getDataLayout(); if (!checkStringCopyLibFuncSignature(Callee, Func)) return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1), *ObjSize = CI->getArgOperand(2); // __stpcpy_chk(x,x,...) -> x+strlen(x) if (Func == LibFunc::stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) { Value *StrLen = EmitStrLen(Src, B, DL, TLI); return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr; } // If a) we don't have any length information, or b) we know this will // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our // st[rp]cpy_chk call which may fail at runtime if the size is too long. // TODO: It might be nice to get a maximum length out of the possible // string lengths for varying. if (isFortifiedCallFoldable(CI, 2, 1, true)) return EmitStrCpy(Dst, Src, B, TLI, Name.substr(2, 6)); if (OnlyLowerUnknownSize) return nullptr; // Maybe we can stil fold __st[rp]cpy_chk to __memcpy_chk. uint64_t Len = GetStringLength(Src); if (Len == 0) return nullptr; Type *SizeTTy = DL.getIntPtrType(CI->getContext()); Value *LenV = ConstantInt::get(SizeTTy, Len); Value *Ret = EmitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI); // If the function was an __stpcpy_chk, and we were able to fold it into // a __memcpy_chk, we still need to return the correct end pointer. if (Ret && Func == LibFunc::stpcpy_chk) return B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(SizeTTy, Len - 1)); return Ret; } Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI, IRBuilder<> &B, LibFunc::Func Func) { Function *Callee = CI->getCalledFunction(); StringRef Name = Callee->getName(); if (!checkStringCopyLibFuncSignature(Callee, Func)) return nullptr; if (isFortifiedCallFoldable(CI, 3, 2, false)) { Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B, TLI, Name.substr(2, 7)); return Ret; } return nullptr; } Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) { // FIXME: We shouldn't be changing "nobuiltin" or TLI unavailable calls here. // Some clang users checked for _chk libcall availability using: // __has_builtin(__builtin___memcpy_chk) // When compiling with -fno-builtin, this is always true. // When passing -ffreestanding/-mkernel, which both imply -fno-builtin, we // end up with fortified libcalls, which isn't acceptable in a freestanding // environment which only provides their non-fortified counterparts. // // Until we change clang and/or teach external users to check for availability // differently, disregard the "nobuiltin" attribute and TLI::has. // // PR23093. LibFunc::Func Func; Function *Callee = CI->getCalledFunction(); StringRef FuncName = Callee->getName(); SmallVector OpBundles; CI->getOperandBundlesAsDefs(OpBundles); IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles); bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C; // First, check that this is a known library functions. if (!TLI->getLibFunc(FuncName, Func)) return nullptr; // We never change the calling convention. if (!ignoreCallingConv(Func) && !isCallingConvC) return nullptr; switch (Func) { case LibFunc::memcpy_chk: return optimizeMemCpyChk(CI, Builder); case LibFunc::memmove_chk: return optimizeMemMoveChk(CI, Builder); case LibFunc::memset_chk: return optimizeMemSetChk(CI, Builder); case LibFunc::stpcpy_chk: case LibFunc::strcpy_chk: return optimizeStrpCpyChk(CI, Builder, Func); case LibFunc::stpncpy_chk: case LibFunc::strncpy_chk: return optimizeStrpNCpyChk(CI, Builder, Func); default: break; } return nullptr; } FortifiedLibCallSimplifier::FortifiedLibCallSimplifier( const TargetLibraryInfo *TLI, bool OnlyLowerUnknownSize) : TLI(TLI), OnlyLowerUnknownSize(OnlyLowerUnknownSize) {} Index: projects/clang380-import/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp =================================================================== --- projects/clang380-import/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp (revision 294609) @@ -1,4258 +1,4260 @@ //===----- CGOpenMPRuntime.cpp - Interface to OpenMP Runtimes -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This provides a class for OpenMP runtime code generation. // //===----------------------------------------------------------------------===// #include "CGCXXABI.h" #include "CGCleanup.h" #include "CGOpenMPRuntime.h" #include "CodeGenFunction.h" #include "clang/AST/Decl.h" #include "clang/AST/StmtOpenMP.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Value.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include using namespace clang; using namespace CodeGen; namespace { /// \brief Base class for handling code generation inside OpenMP regions. class CGOpenMPRegionInfo : public CodeGenFunction::CGCapturedStmtInfo { public: /// \brief Kinds of OpenMP regions used in codegen. enum CGOpenMPRegionKind { /// \brief Region with outlined function for standalone 'parallel' /// directive. ParallelOutlinedRegion, /// \brief Region with outlined function for standalone 'task' directive. TaskOutlinedRegion, /// \brief Region for constructs that do not require function outlining, /// like 'for', 'sections', 'atomic' etc. directives. InlinedRegion, /// \brief Region with outlined function for standalone 'target' directive. TargetRegion, }; CGOpenMPRegionInfo(const CapturedStmt &CS, const CGOpenMPRegionKind RegionKind, const RegionCodeGenTy &CodeGen, OpenMPDirectiveKind Kind, bool HasCancel) : CGCapturedStmtInfo(CS, CR_OpenMP), RegionKind(RegionKind), CodeGen(CodeGen), Kind(Kind), HasCancel(HasCancel) {} CGOpenMPRegionInfo(const CGOpenMPRegionKind RegionKind, const RegionCodeGenTy &CodeGen, OpenMPDirectiveKind Kind, bool HasCancel) : CGCapturedStmtInfo(CR_OpenMP), RegionKind(RegionKind), CodeGen(CodeGen), Kind(Kind), HasCancel(HasCancel) {} /// \brief Get a variable or parameter for storing global thread id /// inside OpenMP construct. virtual const VarDecl *getThreadIDVariable() const = 0; /// \brief Emit the captured statement body. void EmitBody(CodeGenFunction &CGF, const Stmt *S) override; /// \brief Get an LValue for the current ThreadID variable. /// \return LValue for thread id variable. This LValue always has type int32*. virtual LValue getThreadIDVariableLValue(CodeGenFunction &CGF); CGOpenMPRegionKind getRegionKind() const { return RegionKind; } OpenMPDirectiveKind getDirectiveKind() const { return Kind; } bool hasCancel() const { return HasCancel; } static bool classof(const CGCapturedStmtInfo *Info) { return Info->getKind() == CR_OpenMP; } protected: CGOpenMPRegionKind RegionKind; RegionCodeGenTy CodeGen; OpenMPDirectiveKind Kind; bool HasCancel; }; /// \brief API for captured statement code generation in OpenMP constructs. class CGOpenMPOutlinedRegionInfo : public CGOpenMPRegionInfo { public: CGOpenMPOutlinedRegionInfo(const CapturedStmt &CS, const VarDecl *ThreadIDVar, const RegionCodeGenTy &CodeGen, OpenMPDirectiveKind Kind, bool HasCancel) : CGOpenMPRegionInfo(CS, ParallelOutlinedRegion, CodeGen, Kind, HasCancel), ThreadIDVar(ThreadIDVar) { assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region."); } /// \brief Get a variable or parameter for storing global thread id /// inside OpenMP construct. const VarDecl *getThreadIDVariable() const override { return ThreadIDVar; } /// \brief Get the name of the capture helper. StringRef getHelperName() const override { return ".omp_outlined."; } static bool classof(const CGCapturedStmtInfo *Info) { return CGOpenMPRegionInfo::classof(Info) && cast(Info)->getRegionKind() == ParallelOutlinedRegion; } private: /// \brief A variable or parameter storing global thread id for OpenMP /// constructs. const VarDecl *ThreadIDVar; }; /// \brief API for captured statement code generation in OpenMP constructs. class CGOpenMPTaskOutlinedRegionInfo : public CGOpenMPRegionInfo { public: CGOpenMPTaskOutlinedRegionInfo(const CapturedStmt &CS, const VarDecl *ThreadIDVar, const RegionCodeGenTy &CodeGen, OpenMPDirectiveKind Kind, bool HasCancel) : CGOpenMPRegionInfo(CS, TaskOutlinedRegion, CodeGen, Kind, HasCancel), ThreadIDVar(ThreadIDVar) { assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region."); } /// \brief Get a variable or parameter for storing global thread id /// inside OpenMP construct. const VarDecl *getThreadIDVariable() const override { return ThreadIDVar; } /// \brief Get an LValue for the current ThreadID variable. LValue getThreadIDVariableLValue(CodeGenFunction &CGF) override; /// \brief Get the name of the capture helper. StringRef getHelperName() const override { return ".omp_outlined."; } static bool classof(const CGCapturedStmtInfo *Info) { return CGOpenMPRegionInfo::classof(Info) && cast(Info)->getRegionKind() == TaskOutlinedRegion; } private: /// \brief A variable or parameter storing global thread id for OpenMP /// constructs. const VarDecl *ThreadIDVar; }; /// \brief API for inlined captured statement code generation in OpenMP /// constructs. class CGOpenMPInlinedRegionInfo : public CGOpenMPRegionInfo { public: CGOpenMPInlinedRegionInfo(CodeGenFunction::CGCapturedStmtInfo *OldCSI, const RegionCodeGenTy &CodeGen, OpenMPDirectiveKind Kind, bool HasCancel) : CGOpenMPRegionInfo(InlinedRegion, CodeGen, Kind, HasCancel), OldCSI(OldCSI), OuterRegionInfo(dyn_cast_or_null(OldCSI)) {} // \brief Retrieve the value of the context parameter. llvm::Value *getContextValue() const override { if (OuterRegionInfo) return OuterRegionInfo->getContextValue(); llvm_unreachable("No context value for inlined OpenMP region"); } void setContextValue(llvm::Value *V) override { if (OuterRegionInfo) { OuterRegionInfo->setContextValue(V); return; } llvm_unreachable("No context value for inlined OpenMP region"); } /// \brief Lookup the captured field decl for a variable. const FieldDecl *lookup(const VarDecl *VD) const override { if (OuterRegionInfo) return OuterRegionInfo->lookup(VD); // If there is no outer outlined region,no need to lookup in a list of // captured variables, we can use the original one. return nullptr; } FieldDecl *getThisFieldDecl() const override { if (OuterRegionInfo) return OuterRegionInfo->getThisFieldDecl(); return nullptr; } /// \brief Get a variable or parameter for storing global thread id /// inside OpenMP construct. const VarDecl *getThreadIDVariable() const override { if (OuterRegionInfo) return OuterRegionInfo->getThreadIDVariable(); return nullptr; } /// \brief Get the name of the capture helper. StringRef getHelperName() const override { if (auto *OuterRegionInfo = getOldCSI()) return OuterRegionInfo->getHelperName(); llvm_unreachable("No helper name for inlined OpenMP construct"); } CodeGenFunction::CGCapturedStmtInfo *getOldCSI() const { return OldCSI; } static bool classof(const CGCapturedStmtInfo *Info) { return CGOpenMPRegionInfo::classof(Info) && cast(Info)->getRegionKind() == InlinedRegion; } private: /// \brief CodeGen info about outer OpenMP region. CodeGenFunction::CGCapturedStmtInfo *OldCSI; CGOpenMPRegionInfo *OuterRegionInfo; }; /// \brief API for captured statement code generation in OpenMP target /// constructs. For this captures, implicit parameters are used instead of the /// captured fields. The name of the target region has to be unique in a given /// application so it is provided by the client, because only the client has /// the information to generate that. class CGOpenMPTargetRegionInfo : public CGOpenMPRegionInfo { public: CGOpenMPTargetRegionInfo(const CapturedStmt &CS, const RegionCodeGenTy &CodeGen, StringRef HelperName) : CGOpenMPRegionInfo(CS, TargetRegion, CodeGen, OMPD_target, /*HasCancel=*/false), HelperName(HelperName) {} /// \brief This is unused for target regions because each starts executing /// with a single thread. const VarDecl *getThreadIDVariable() const override { return nullptr; } /// \brief Get the name of the capture helper. StringRef getHelperName() const override { return HelperName; } static bool classof(const CGCapturedStmtInfo *Info) { return CGOpenMPRegionInfo::classof(Info) && cast(Info)->getRegionKind() == TargetRegion; } private: StringRef HelperName; }; /// \brief RAII for emitting code of OpenMP constructs. class InlinedOpenMPRegionRAII { CodeGenFunction &CGF; public: /// \brief Constructs region for combined constructs. /// \param CodeGen Code generation sequence for combined directives. Includes /// a list of functions used for code generation of implicitly inlined /// regions. InlinedOpenMPRegionRAII(CodeGenFunction &CGF, const RegionCodeGenTy &CodeGen, OpenMPDirectiveKind Kind, bool HasCancel) : CGF(CGF) { // Start emission for the construct. CGF.CapturedStmtInfo = new CGOpenMPInlinedRegionInfo( CGF.CapturedStmtInfo, CodeGen, Kind, HasCancel); } ~InlinedOpenMPRegionRAII() { // Restore original CapturedStmtInfo only if we're done with code emission. auto *OldCSI = cast(CGF.CapturedStmtInfo)->getOldCSI(); delete CGF.CapturedStmtInfo; CGF.CapturedStmtInfo = OldCSI; } }; } // anonymous namespace static LValue emitLoadOfPointerLValue(CodeGenFunction &CGF, Address PtrAddr, QualType Ty) { AlignmentSource Source; CharUnits Align = CGF.getNaturalPointeeTypeAlignment(Ty, &Source); return CGF.MakeAddrLValue(Address(CGF.Builder.CreateLoad(PtrAddr), Align), Ty->getPointeeType(), Source); } LValue CGOpenMPRegionInfo::getThreadIDVariableLValue(CodeGenFunction &CGF) { return emitLoadOfPointerLValue(CGF, CGF.GetAddrOfLocalVar(getThreadIDVariable()), getThreadIDVariable()->getType()); } void CGOpenMPRegionInfo::EmitBody(CodeGenFunction &CGF, const Stmt * /*S*/) { if (!CGF.HaveInsertPoint()) return; // 1.2.2 OpenMP Language Terminology // Structured block - An executable statement with a single entry at the // top and a single exit at the bottom. // The point of exit cannot be a branch out of the structured block. // longjmp() and throw() must not violate the entry/exit criteria. CGF.EHStack.pushTerminate(); { CodeGenFunction::RunCleanupsScope Scope(CGF); CodeGen(CGF); } CGF.EHStack.popTerminate(); } LValue CGOpenMPTaskOutlinedRegionInfo::getThreadIDVariableLValue( CodeGenFunction &CGF) { return CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(getThreadIDVariable()), getThreadIDVariable()->getType(), AlignmentSource::Decl); } CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM) : CGM(CGM), DefaultOpenMPPSource(nullptr), KmpRoutineEntryPtrTy(nullptr), OffloadEntriesInfoManager(CGM) { IdentTy = llvm::StructType::create( "ident_t", CGM.Int32Ty /* reserved_1 */, CGM.Int32Ty /* flags */, CGM.Int32Ty /* reserved_2 */, CGM.Int32Ty /* reserved_3 */, CGM.Int8PtrTy /* psource */, nullptr); // Build void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid,...) llvm::Type *MicroParams[] = {llvm::PointerType::getUnqual(CGM.Int32Ty), llvm::PointerType::getUnqual(CGM.Int32Ty)}; Kmpc_MicroTy = llvm::FunctionType::get(CGM.VoidTy, MicroParams, true); KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8); loadOffloadInfoMetadata(); } void CGOpenMPRuntime::clear() { InternalVars.clear(); } // Layout information for ident_t. static CharUnits getIdentAlign(CodeGenModule &CGM) { return CGM.getPointerAlign(); } static CharUnits getIdentSize(CodeGenModule &CGM) { assert((4 * CGM.getPointerSize()).isMultipleOf(CGM.getPointerAlign())); return CharUnits::fromQuantity(16) + CGM.getPointerSize(); } static CharUnits getOffsetOfIdentField(CGOpenMPRuntime::IdentFieldIndex Field) { // All the fields except the last are i32, so this works beautifully. return unsigned(Field) * CharUnits::fromQuantity(4); } static Address createIdentFieldGEP(CodeGenFunction &CGF, Address Addr, CGOpenMPRuntime::IdentFieldIndex Field, const llvm::Twine &Name = "") { auto Offset = getOffsetOfIdentField(Field); return CGF.Builder.CreateStructGEP(Addr, Field, Offset, Name); } llvm::Value *CGOpenMPRuntime::emitParallelOutlinedFunction( const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { assert(ThreadIDVar->getType()->isPointerType() && "thread id variable must be of type kmp_int32 *"); const CapturedStmt *CS = cast(D.getAssociatedStmt()); CodeGenFunction CGF(CGM, true); bool HasCancel = false; if (auto *OPD = dyn_cast(&D)) HasCancel = OPD->hasCancel(); else if (auto *OPSD = dyn_cast(&D)) HasCancel = OPSD->hasCancel(); else if (auto *OPFD = dyn_cast(&D)) HasCancel = OPFD->hasCancel(); CGOpenMPOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen, InnermostKind, HasCancel); CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo); return CGF.GenerateOpenMPCapturedStmtFunction(*CS); } llvm::Value *CGOpenMPRuntime::emitTaskOutlinedFunction( const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { assert(!ThreadIDVar->getType()->isPointerType() && "thread id variable must be of type kmp_int32 for tasks"); auto *CS = cast(D.getAssociatedStmt()); CodeGenFunction CGF(CGM, true); CGOpenMPTaskOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen, InnermostKind, cast(D).hasCancel()); CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo); return CGF.GenerateCapturedStmtFunction(*CS); } Address CGOpenMPRuntime::getOrCreateDefaultLocation(OpenMPLocationFlags Flags) { CharUnits Align = getIdentAlign(CGM); llvm::Value *Entry = OpenMPDefaultLocMap.lookup(Flags); if (!Entry) { if (!DefaultOpenMPPSource) { // Initialize default location for psource field of ident_t structure of // all ident_t objects. Format is ";file;function;line;column;;". // Taken from // http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp_str.c DefaultOpenMPPSource = CGM.GetAddrOfConstantCString(";unknown;unknown;0;0;;").getPointer(); DefaultOpenMPPSource = llvm::ConstantExpr::getBitCast(DefaultOpenMPPSource, CGM.Int8PtrTy); } auto DefaultOpenMPLocation = new llvm::GlobalVariable( CGM.getModule(), IdentTy, /*isConstant*/ true, llvm::GlobalValue::PrivateLinkage, /*Initializer*/ nullptr); DefaultOpenMPLocation->setUnnamedAddr(true); DefaultOpenMPLocation->setAlignment(Align.getQuantity()); llvm::Constant *Zero = llvm::ConstantInt::get(CGM.Int32Ty, 0, true); llvm::Constant *Values[] = {Zero, llvm::ConstantInt::get(CGM.Int32Ty, Flags), Zero, Zero, DefaultOpenMPPSource}; llvm::Constant *Init = llvm::ConstantStruct::get(IdentTy, Values); DefaultOpenMPLocation->setInitializer(Init); OpenMPDefaultLocMap[Flags] = Entry = DefaultOpenMPLocation; } return Address(Entry, Align); } llvm::Value *CGOpenMPRuntime::emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, OpenMPLocationFlags Flags) { // If no debug info is generated - return global default location. if (CGM.getCodeGenOpts().getDebugInfo() == CodeGenOptions::NoDebugInfo || Loc.isInvalid()) return getOrCreateDefaultLocation(Flags).getPointer(); assert(CGF.CurFn && "No function in current CodeGenFunction."); Address LocValue = Address::invalid(); auto I = OpenMPLocThreadIDMap.find(CGF.CurFn); if (I != OpenMPLocThreadIDMap.end()) LocValue = Address(I->second.DebugLoc, getIdentAlign(CGF.CGM)); // OpenMPLocThreadIDMap may have null DebugLoc and non-null ThreadID, if // GetOpenMPThreadID was called before this routine. if (!LocValue.isValid()) { // Generate "ident_t .kmpc_loc.addr;" Address AI = CGF.CreateTempAlloca(IdentTy, getIdentAlign(CGF.CGM), ".kmpc_loc.addr"); auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn); Elem.second.DebugLoc = AI.getPointer(); LocValue = AI; CGBuilderTy::InsertPointGuard IPG(CGF.Builder); CGF.Builder.SetInsertPoint(CGF.AllocaInsertPt); CGF.Builder.CreateMemCpy(LocValue, getOrCreateDefaultLocation(Flags), CGM.getSize(getIdentSize(CGF.CGM))); } // char **psource = &.kmpc_loc_.addr.psource; Address PSource = createIdentFieldGEP(CGF, LocValue, IdentField_PSource); auto OMPDebugLoc = OpenMPDebugLocMap.lookup(Loc.getRawEncoding()); if (OMPDebugLoc == nullptr) { SmallString<128> Buffer2; llvm::raw_svector_ostream OS2(Buffer2); // Build debug location PresumedLoc PLoc = CGF.getContext().getSourceManager().getPresumedLoc(Loc); OS2 << ";" << PLoc.getFilename() << ";"; if (const FunctionDecl *FD = dyn_cast_or_null(CGF.CurFuncDecl)) { OS2 << FD->getQualifiedNameAsString(); } OS2 << ";" << PLoc.getLine() << ";" << PLoc.getColumn() << ";;"; OMPDebugLoc = CGF.Builder.CreateGlobalStringPtr(OS2.str()); OpenMPDebugLocMap[Loc.getRawEncoding()] = OMPDebugLoc; } // *psource = ";;;;;;"; CGF.Builder.CreateStore(OMPDebugLoc, PSource); // Our callers always pass this to a runtime function, so for // convenience, go ahead and return a naked pointer. return LocValue.getPointer(); } llvm::Value *CGOpenMPRuntime::getThreadID(CodeGenFunction &CGF, SourceLocation Loc) { assert(CGF.CurFn && "No function in current CodeGenFunction."); llvm::Value *ThreadID = nullptr; // Check whether we've already cached a load of the thread id in this // function. auto I = OpenMPLocThreadIDMap.find(CGF.CurFn); if (I != OpenMPLocThreadIDMap.end()) { ThreadID = I->second.ThreadID; if (ThreadID != nullptr) return ThreadID; } if (auto OMPRegionInfo = dyn_cast_or_null(CGF.CapturedStmtInfo)) { if (OMPRegionInfo->getThreadIDVariable()) { // Check if this an outlined function with thread id passed as argument. auto LVal = OMPRegionInfo->getThreadIDVariableLValue(CGF); ThreadID = CGF.EmitLoadOfLValue(LVal, Loc).getScalarVal(); // If value loaded in entry block, cache it and use it everywhere in // function. if (CGF.Builder.GetInsertBlock() == CGF.AllocaInsertPt->getParent()) { auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn); Elem.second.ThreadID = ThreadID; } return ThreadID; } } // This is not an outlined function region - need to call __kmpc_int32 // kmpc_global_thread_num(ident_t *loc). // Generate thread id value and cache this value for use across the // function. CGBuilderTy::InsertPointGuard IPG(CGF.Builder); CGF.Builder.SetInsertPoint(CGF.AllocaInsertPt); ThreadID = CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_global_thread_num), emitUpdateLocation(CGF, Loc)); auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn); Elem.second.ThreadID = ThreadID; return ThreadID; } void CGOpenMPRuntime::functionFinished(CodeGenFunction &CGF) { assert(CGF.CurFn && "No function in current CodeGenFunction."); if (OpenMPLocThreadIDMap.count(CGF.CurFn)) OpenMPLocThreadIDMap.erase(CGF.CurFn); } llvm::Type *CGOpenMPRuntime::getIdentTyPointerTy() { return llvm::PointerType::getUnqual(IdentTy); } llvm::Type *CGOpenMPRuntime::getKmpc_MicroPointerTy() { return llvm::PointerType::getUnqual(Kmpc_MicroTy); } llvm::Constant * CGOpenMPRuntime::createRuntimeFunction(OpenMPRTLFunction Function) { llvm::Constant *RTLFn = nullptr; switch (Function) { case OMPRTL__kmpc_fork_call: { // Build void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro // microtask, ...); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, getKmpc_MicroPointerTy()}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ true); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_fork_call"); break; } case OMPRTL__kmpc_global_thread_num: { // Build kmp_int32 __kmpc_global_thread_num(ident_t *loc); llvm::Type *TypeParams[] = {getIdentTyPointerTy()}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_global_thread_num"); break; } case OMPRTL__kmpc_threadprivate_cached: { // Build void *__kmpc_threadprivate_cached(ident_t *loc, // kmp_int32 global_tid, void *data, size_t size, void ***cache); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.VoidPtrTy, CGM.SizeTy, CGM.VoidPtrTy->getPointerTo()->getPointerTo()}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_threadprivate_cached"); break; } case OMPRTL__kmpc_critical: { // Build void __kmpc_critical(ident_t *loc, kmp_int32 global_tid, // kmp_critical_name *crit); llvm::Type *TypeParams[] = { getIdentTyPointerTy(), CGM.Int32Ty, llvm::PointerType::getUnqual(KmpCriticalNameTy)}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_critical"); break; } case OMPRTL__kmpc_critical_with_hint: { // Build void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid, // kmp_critical_name *crit, uintptr_t hint); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, llvm::PointerType::getUnqual(KmpCriticalNameTy), CGM.IntPtrTy}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_critical_with_hint"); break; } case OMPRTL__kmpc_threadprivate_register: { // Build void __kmpc_threadprivate_register(ident_t *, void *data, // kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor); // typedef void *(*kmpc_ctor)(void *); auto KmpcCtorTy = llvm::FunctionType::get(CGM.VoidPtrTy, CGM.VoidPtrTy, /*isVarArg*/ false)->getPointerTo(); // typedef void *(*kmpc_cctor)(void *, void *); llvm::Type *KmpcCopyCtorTyArgs[] = {CGM.VoidPtrTy, CGM.VoidPtrTy}; auto KmpcCopyCtorTy = llvm::FunctionType::get(CGM.VoidPtrTy, KmpcCopyCtorTyArgs, /*isVarArg*/ false)->getPointerTo(); // typedef void (*kmpc_dtor)(void *); auto KmpcDtorTy = llvm::FunctionType::get(CGM.VoidTy, CGM.VoidPtrTy, /*isVarArg*/ false) ->getPointerTo(); llvm::Type *FnTyArgs[] = {getIdentTyPointerTy(), CGM.VoidPtrTy, KmpcCtorTy, KmpcCopyCtorTy, KmpcDtorTy}; auto FnTy = llvm::FunctionType::get(CGM.VoidTy, FnTyArgs, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_threadprivate_register"); break; } case OMPRTL__kmpc_end_critical: { // Build void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, // kmp_critical_name *crit); llvm::Type *TypeParams[] = { getIdentTyPointerTy(), CGM.Int32Ty, llvm::PointerType::getUnqual(KmpCriticalNameTy)}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_critical"); break; } case OMPRTL__kmpc_cancel_barrier: { // Build kmp_int32 __kmpc_cancel_barrier(ident_t *loc, kmp_int32 // global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_cancel_barrier"); break; } case OMPRTL__kmpc_barrier: { // Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier"); break; } case OMPRTL__kmpc_for_static_fini: { // Build void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_for_static_fini"); break; } case OMPRTL__kmpc_push_num_threads: { // Build void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, // kmp_int32 num_threads) llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_push_num_threads"); break; } case OMPRTL__kmpc_serialized_parallel: { // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 // global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel"); break; } case OMPRTL__kmpc_end_serialized_parallel: { // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 // global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel"); break; } case OMPRTL__kmpc_flush: { // Build void __kmpc_flush(ident_t *loc); llvm::Type *TypeParams[] = {getIdentTyPointerTy()}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_flush"); break; } case OMPRTL__kmpc_master: { // Build kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_master"); break; } case OMPRTL__kmpc_end_master: { // Build void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_end_master"); break; } case OMPRTL__kmpc_omp_taskyield: { // Build kmp_int32 __kmpc_omp_taskyield(ident_t *, kmp_int32 global_tid, // int end_part); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_taskyield"); break; } case OMPRTL__kmpc_single: { // Build kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_single"); break; } case OMPRTL__kmpc_end_single: { // Build void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_end_single"); break; } case OMPRTL__kmpc_omp_task_alloc: { // Build kmp_task_t *__kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid, // kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds, // kmp_routine_entry_t *task_entry); assert(KmpRoutineEntryPtrTy != nullptr && "Type kmp_routine_entry_t must be created."); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty, CGM.SizeTy, CGM.SizeTy, KmpRoutineEntryPtrTy}; // Return void * and then cast to particular kmp_task_t type. llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_task_alloc"); break; } case OMPRTL__kmpc_omp_task: { // Build kmp_int32 __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t // *new_task); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.VoidPtrTy}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_task"); break; } case OMPRTL__kmpc_copyprivate: { // Build void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid, // size_t cpy_size, void *cpy_data, void(*cpy_func)(void *, void *), // kmp_int32 didit); llvm::Type *CpyTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy}; auto *CpyFnTy = llvm::FunctionType::get(CGM.VoidTy, CpyTypeParams, /*isVarArg=*/false); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.SizeTy, CGM.VoidPtrTy, CpyFnTy->getPointerTo(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_copyprivate"); break; } case OMPRTL__kmpc_reduce: { // Build kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, // kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void // (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name *lck); llvm::Type *ReduceTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy}; auto *ReduceFnTy = llvm::FunctionType::get(CGM.VoidTy, ReduceTypeParams, /*isVarArg=*/false); llvm::Type *TypeParams[] = { getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty, CGM.SizeTy, CGM.VoidPtrTy, ReduceFnTy->getPointerTo(), llvm::PointerType::getUnqual(KmpCriticalNameTy)}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_reduce"); break; } case OMPRTL__kmpc_reduce_nowait: { // Build kmp_int32 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 // global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data, // void (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name // *lck); llvm::Type *ReduceTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy}; auto *ReduceFnTy = llvm::FunctionType::get(CGM.VoidTy, ReduceTypeParams, /*isVarArg=*/false); llvm::Type *TypeParams[] = { getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty, CGM.SizeTy, CGM.VoidPtrTy, ReduceFnTy->getPointerTo(), llvm::PointerType::getUnqual(KmpCriticalNameTy)}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_reduce_nowait"); break; } case OMPRTL__kmpc_end_reduce: { // Build void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, // kmp_critical_name *lck); llvm::Type *TypeParams[] = { getIdentTyPointerTy(), CGM.Int32Ty, llvm::PointerType::getUnqual(KmpCriticalNameTy)}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_end_reduce"); break; } case OMPRTL__kmpc_end_reduce_nowait: { // Build __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid, // kmp_critical_name *lck); llvm::Type *TypeParams[] = { getIdentTyPointerTy(), CGM.Int32Ty, llvm::PointerType::getUnqual(KmpCriticalNameTy)}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_end_reduce_nowait"); break; } case OMPRTL__kmpc_omp_task_begin_if0: { // Build void __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t // *new_task); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.VoidPtrTy}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_task_begin_if0"); break; } case OMPRTL__kmpc_omp_task_complete_if0: { // Build void __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t // *new_task); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.VoidPtrTy}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_task_complete_if0"); break; } case OMPRTL__kmpc_ordered: { // Build void __kmpc_ordered(ident_t *loc, kmp_int32 global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_ordered"); break; } case OMPRTL__kmpc_end_ordered: { // Build void __kmpc_end_ordered(ident_t *loc, kmp_int32 global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_ordered"); break; } case OMPRTL__kmpc_omp_taskwait: { // Build kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32 global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_omp_taskwait"); break; } case OMPRTL__kmpc_taskgroup: { // Build void __kmpc_taskgroup(ident_t *loc, kmp_int32 global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_taskgroup"); break; } case OMPRTL__kmpc_end_taskgroup: { // Build void __kmpc_end_taskgroup(ident_t *loc, kmp_int32 global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_taskgroup"); break; } case OMPRTL__kmpc_push_proc_bind: { // Build void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, // int proc_bind) llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_push_proc_bind"); break; } case OMPRTL__kmpc_omp_task_with_deps: { // Build kmp_int32 __kmpc_omp_task_with_deps(ident_t *, kmp_int32 gtid, // kmp_task_t *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list, // kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list); llvm::Type *TypeParams[] = { getIdentTyPointerTy(), CGM.Int32Ty, CGM.VoidPtrTy, CGM.Int32Ty, CGM.VoidPtrTy, CGM.Int32Ty, CGM.VoidPtrTy}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_task_with_deps"); break; } case OMPRTL__kmpc_omp_wait_deps: { // Build void __kmpc_omp_wait_deps(ident_t *, kmp_int32 gtid, // kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias, // kmp_depend_info_t *noalias_dep_list); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty, CGM.VoidPtrTy, CGM.Int32Ty, CGM.VoidPtrTy}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_wait_deps"); break; } case OMPRTL__kmpc_cancellationpoint: { // Build kmp_int32 __kmpc_cancellationpoint(ident_t *loc, kmp_int32 // global_tid, kmp_int32 cncl_kind) llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_cancellationpoint"); break; } case OMPRTL__kmpc_cancel: { // Build kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid, // kmp_int32 cncl_kind) llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_cancel"); break; } case OMPRTL__tgt_target: { // Build int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t // arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t // *arg_types); llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.VoidPtrTy, CGM.Int32Ty, CGM.VoidPtrPtrTy, CGM.VoidPtrPtrTy, CGM.SizeTy->getPointerTo(), CGM.Int32Ty->getPointerTo()}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target"); break; } case OMPRTL__tgt_register_lib: { // Build void __tgt_register_lib(__tgt_bin_desc *desc); QualType ParamTy = CGM.getContext().getPointerType(getTgtBinaryDescriptorQTy()); llvm::Type *TypeParams[] = {CGM.getTypes().ConvertTypeForMem(ParamTy)}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_register_lib"); break; } case OMPRTL__tgt_unregister_lib: { // Build void __tgt_unregister_lib(__tgt_bin_desc *desc); QualType ParamTy = CGM.getContext().getPointerType(getTgtBinaryDescriptorQTy()); llvm::Type *TypeParams[] = {CGM.getTypes().ConvertTypeForMem(ParamTy)}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_unregister_lib"); break; } } return RTLFn; } static llvm::Value *getTypeSize(CodeGenFunction &CGF, QualType Ty) { auto &C = CGF.getContext(); llvm::Value *Size = nullptr; auto SizeInChars = C.getTypeSizeInChars(Ty); if (SizeInChars.isZero()) { // getTypeSizeInChars() returns 0 for a VLA. while (auto *VAT = C.getAsVariableArrayType(Ty)) { llvm::Value *ArraySize; std::tie(ArraySize, Ty) = CGF.getVLASize(VAT); Size = Size ? CGF.Builder.CreateNUWMul(Size, ArraySize) : ArraySize; } SizeInChars = C.getTypeSizeInChars(Ty); assert(!SizeInChars.isZero()); Size = CGF.Builder.CreateNUWMul( Size, llvm::ConstantInt::get(CGF.SizeTy, SizeInChars.getQuantity())); } else Size = llvm::ConstantInt::get(CGF.SizeTy, SizeInChars.getQuantity()); return Size; } llvm::Constant *CGOpenMPRuntime::createForStaticInitFunction(unsigned IVSize, bool IVSigned) { assert((IVSize == 32 || IVSize == 64) && "IV size is not compatible with the omp runtime"); auto Name = IVSize == 32 ? (IVSigned ? "__kmpc_for_static_init_4" : "__kmpc_for_static_init_4u") : (IVSigned ? "__kmpc_for_static_init_8" : "__kmpc_for_static_init_8u"); auto ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty; auto PtrTy = llvm::PointerType::getUnqual(ITy); llvm::Type *TypeParams[] = { getIdentTyPointerTy(), // loc CGM.Int32Ty, // tid CGM.Int32Ty, // schedtype llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter PtrTy, // p_lower PtrTy, // p_upper PtrTy, // p_stride ITy, // incr ITy // chunk }; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); return CGM.CreateRuntimeFunction(FnTy, Name); } llvm::Constant *CGOpenMPRuntime::createDispatchInitFunction(unsigned IVSize, bool IVSigned) { assert((IVSize == 32 || IVSize == 64) && "IV size is not compatible with the omp runtime"); auto Name = IVSize == 32 ? (IVSigned ? "__kmpc_dispatch_init_4" : "__kmpc_dispatch_init_4u") : (IVSigned ? "__kmpc_dispatch_init_8" : "__kmpc_dispatch_init_8u"); auto ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty; llvm::Type *TypeParams[] = { getIdentTyPointerTy(), // loc CGM.Int32Ty, // tid CGM.Int32Ty, // schedtype ITy, // lower ITy, // upper ITy, // stride ITy // chunk }; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); return CGM.CreateRuntimeFunction(FnTy, Name); } llvm::Constant *CGOpenMPRuntime::createDispatchFiniFunction(unsigned IVSize, bool IVSigned) { assert((IVSize == 32 || IVSize == 64) && "IV size is not compatible with the omp runtime"); auto Name = IVSize == 32 ? (IVSigned ? "__kmpc_dispatch_fini_4" : "__kmpc_dispatch_fini_4u") : (IVSigned ? "__kmpc_dispatch_fini_8" : "__kmpc_dispatch_fini_8u"); llvm::Type *TypeParams[] = { getIdentTyPointerTy(), // loc CGM.Int32Ty, // tid }; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); return CGM.CreateRuntimeFunction(FnTy, Name); } llvm::Constant *CGOpenMPRuntime::createDispatchNextFunction(unsigned IVSize, bool IVSigned) { assert((IVSize == 32 || IVSize == 64) && "IV size is not compatible with the omp runtime"); auto Name = IVSize == 32 ? (IVSigned ? "__kmpc_dispatch_next_4" : "__kmpc_dispatch_next_4u") : (IVSigned ? "__kmpc_dispatch_next_8" : "__kmpc_dispatch_next_8u"); auto ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty; auto PtrTy = llvm::PointerType::getUnqual(ITy); llvm::Type *TypeParams[] = { getIdentTyPointerTy(), // loc CGM.Int32Ty, // tid llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter PtrTy, // p_lower PtrTy, // p_upper PtrTy // p_stride }; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); return CGM.CreateRuntimeFunction(FnTy, Name); } llvm::Constant * CGOpenMPRuntime::getOrCreateThreadPrivateCache(const VarDecl *VD) { assert(!CGM.getLangOpts().OpenMPUseTLS || !CGM.getContext().getTargetInfo().isTLSSupported()); // Lookup the entry, lazily creating it if necessary. return getOrCreateInternalVariable(CGM.Int8PtrPtrTy, Twine(CGM.getMangledName(VD)) + ".cache."); } Address CGOpenMPRuntime::getAddrOfThreadPrivate(CodeGenFunction &CGF, const VarDecl *VD, Address VDAddr, SourceLocation Loc) { if (CGM.getLangOpts().OpenMPUseTLS && CGM.getContext().getTargetInfo().isTLSSupported()) return VDAddr; auto VarTy = VDAddr.getElementType(); llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc), CGF.Builder.CreatePointerCast(VDAddr.getPointer(), CGM.Int8PtrTy), CGM.getSize(CGM.GetTargetTypeStoreSize(VarTy)), getOrCreateThreadPrivateCache(VD)}; return Address(CGF.EmitRuntimeCall( createRuntimeFunction(OMPRTL__kmpc_threadprivate_cached), Args), VDAddr.getAlignment()); } void CGOpenMPRuntime::emitThreadPrivateVarInit( CodeGenFunction &CGF, Address VDAddr, llvm::Value *Ctor, llvm::Value *CopyCtor, llvm::Value *Dtor, SourceLocation Loc) { // Call kmp_int32 __kmpc_global_thread_num(&loc) to init OpenMP runtime // library. auto OMPLoc = emitUpdateLocation(CGF, Loc); CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_global_thread_num), OMPLoc); // Call __kmpc_threadprivate_register(&loc, &var, ctor, cctor/*NULL*/, dtor) // to register constructor/destructor for variable. llvm::Value *Args[] = {OMPLoc, CGF.Builder.CreatePointerCast(VDAddr.getPointer(), CGM.VoidPtrTy), Ctor, CopyCtor, Dtor}; CGF.EmitRuntimeCall( createRuntimeFunction(OMPRTL__kmpc_threadprivate_register), Args); } llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition( const VarDecl *VD, Address VDAddr, SourceLocation Loc, bool PerformInit, CodeGenFunction *CGF) { if (CGM.getLangOpts().OpenMPUseTLS && CGM.getContext().getTargetInfo().isTLSSupported()) return nullptr; VD = VD->getDefinition(CGM.getContext()); if (VD && ThreadPrivateWithDefinition.count(VD) == 0) { ThreadPrivateWithDefinition.insert(VD); QualType ASTTy = VD->getType(); llvm::Value *Ctor = nullptr, *CopyCtor = nullptr, *Dtor = nullptr; auto Init = VD->getAnyInitializer(); if (CGM.getLangOpts().CPlusPlus && PerformInit) { // Generate function that re-emits the declaration's initializer into the // threadprivate copy of the variable VD CodeGenFunction CtorCGF(CGM); FunctionArgList Args; ImplicitParamDecl Dst(CGM.getContext(), /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr, CGM.getContext().VoidPtrTy); Args.push_back(&Dst); auto &FI = CGM.getTypes().arrangeFreeFunctionDeclaration( CGM.getContext().VoidPtrTy, Args, FunctionType::ExtInfo(), /*isVariadic=*/false); auto FTy = CGM.getTypes().GetFunctionType(FI); auto Fn = CGM.CreateGlobalInitOrDestructFunction( FTy, ".__kmpc_global_ctor_.", FI, Loc); CtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidPtrTy, Fn, FI, Args, SourceLocation()); auto ArgVal = CtorCGF.EmitLoadOfScalar( CtorCGF.GetAddrOfLocalVar(&Dst), /*Volatile=*/false, CGM.getContext().VoidPtrTy, Dst.getLocation()); Address Arg = Address(ArgVal, VDAddr.getAlignment()); Arg = CtorCGF.Builder.CreateElementBitCast(Arg, CtorCGF.ConvertTypeForMem(ASTTy)); CtorCGF.EmitAnyExprToMem(Init, Arg, Init->getType().getQualifiers(), /*IsInitializer=*/true); ArgVal = CtorCGF.EmitLoadOfScalar( CtorCGF.GetAddrOfLocalVar(&Dst), /*Volatile=*/false, CGM.getContext().VoidPtrTy, Dst.getLocation()); CtorCGF.Builder.CreateStore(ArgVal, CtorCGF.ReturnValue); CtorCGF.FinishFunction(); Ctor = Fn; } if (VD->getType().isDestructedType() != QualType::DK_none) { // Generate function that emits destructor call for the threadprivate copy // of the variable VD CodeGenFunction DtorCGF(CGM); FunctionArgList Args; ImplicitParamDecl Dst(CGM.getContext(), /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr, CGM.getContext().VoidPtrTy); Args.push_back(&Dst); auto &FI = CGM.getTypes().arrangeFreeFunctionDeclaration( CGM.getContext().VoidTy, Args, FunctionType::ExtInfo(), /*isVariadic=*/false); auto FTy = CGM.getTypes().GetFunctionType(FI); auto Fn = CGM.CreateGlobalInitOrDestructFunction( FTy, ".__kmpc_global_dtor_.", FI, Loc); DtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, Fn, FI, Args, SourceLocation()); auto ArgVal = DtorCGF.EmitLoadOfScalar( DtorCGF.GetAddrOfLocalVar(&Dst), /*Volatile=*/false, CGM.getContext().VoidPtrTy, Dst.getLocation()); DtorCGF.emitDestroy(Address(ArgVal, VDAddr.getAlignment()), ASTTy, DtorCGF.getDestroyer(ASTTy.isDestructedType()), DtorCGF.needsEHCleanup(ASTTy.isDestructedType())); DtorCGF.FinishFunction(); Dtor = Fn; } // Do not emit init function if it is not required. if (!Ctor && !Dtor) return nullptr; llvm::Type *CopyCtorTyArgs[] = {CGM.VoidPtrTy, CGM.VoidPtrTy}; auto CopyCtorTy = llvm::FunctionType::get(CGM.VoidPtrTy, CopyCtorTyArgs, /*isVarArg=*/false)->getPointerTo(); // Copying constructor for the threadprivate variable. // Must be NULL - reserved by runtime, but currently it requires that this // parameter is always NULL. Otherwise it fires assertion. CopyCtor = llvm::Constant::getNullValue(CopyCtorTy); if (Ctor == nullptr) { auto CtorTy = llvm::FunctionType::get(CGM.VoidPtrTy, CGM.VoidPtrTy, /*isVarArg=*/false)->getPointerTo(); Ctor = llvm::Constant::getNullValue(CtorTy); } if (Dtor == nullptr) { auto DtorTy = llvm::FunctionType::get(CGM.VoidTy, CGM.VoidPtrTy, /*isVarArg=*/false)->getPointerTo(); Dtor = llvm::Constant::getNullValue(DtorTy); } if (!CGF) { auto InitFunctionTy = llvm::FunctionType::get(CGM.VoidTy, /*isVarArg*/ false); auto InitFunction = CGM.CreateGlobalInitOrDestructFunction( InitFunctionTy, ".__omp_threadprivate_init_.", CGM.getTypes().arrangeNullaryFunction()); CodeGenFunction InitCGF(CGM); FunctionArgList ArgList; InitCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, InitFunction, CGM.getTypes().arrangeNullaryFunction(), ArgList, Loc); emitThreadPrivateVarInit(InitCGF, VDAddr, Ctor, CopyCtor, Dtor, Loc); InitCGF.FinishFunction(); return InitFunction; } emitThreadPrivateVarInit(*CGF, VDAddr, Ctor, CopyCtor, Dtor, Loc); } return nullptr; } /// \brief Emits code for OpenMP 'if' clause using specified \a CodeGen /// function. Here is the logic: /// if (Cond) { /// ThenGen(); /// } else { /// ElseGen(); /// } static void emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond, const RegionCodeGenTy &ThenGen, const RegionCodeGenTy &ElseGen) { CodeGenFunction::LexicalScope ConditionScope(CGF, Cond->getSourceRange()); // If the condition constant folds and can be elided, try to avoid emitting // the condition and the dead arm of the if/else. bool CondConstant; if (CGF.ConstantFoldsToSimpleInteger(Cond, CondConstant)) { CodeGenFunction::RunCleanupsScope Scope(CGF); if (CondConstant) { ThenGen(CGF); } else { ElseGen(CGF); } return; } // Otherwise, the condition did not fold, or we couldn't elide it. Just // emit the conditional branch. auto ThenBlock = CGF.createBasicBlock("omp_if.then"); auto ElseBlock = CGF.createBasicBlock("omp_if.else"); auto ContBlock = CGF.createBasicBlock("omp_if.end"); CGF.EmitBranchOnBoolExpr(Cond, ThenBlock, ElseBlock, /*TrueCount=*/0); // Emit the 'then' code. CGF.EmitBlock(ThenBlock); { CodeGenFunction::RunCleanupsScope ThenScope(CGF); ThenGen(CGF); } CGF.EmitBranch(ContBlock); // Emit the 'else' code if present. { // There is no need to emit line number for unconditional branch. auto NL = ApplyDebugLocation::CreateEmpty(CGF); CGF.EmitBlock(ElseBlock); } { CodeGenFunction::RunCleanupsScope ThenScope(CGF); ElseGen(CGF); } { // There is no need to emit line number for unconditional branch. auto NL = ApplyDebugLocation::CreateEmpty(CGF); CGF.EmitBranch(ContBlock); } // Emit the continuation block for code after the if. CGF.EmitBlock(ContBlock, /*IsFinished=*/true); } void CGOpenMPRuntime::emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef CapturedVars, const Expr *IfCond) { if (!CGF.HaveInsertPoint()) return; auto *RTLoc = emitUpdateLocation(CGF, Loc); auto &&ThenGen = [this, OutlinedFn, CapturedVars, RTLoc](CodeGenFunction &CGF) { // Build call __kmpc_fork_call(loc, n, microtask, var1, .., varn); llvm::Value *Args[] = { RTLoc, CGF.Builder.getInt32(CapturedVars.size()), // Number of captured vars CGF.Builder.CreateBitCast(OutlinedFn, getKmpc_MicroPointerTy())}; llvm::SmallVector RealArgs; RealArgs.append(std::begin(Args), std::end(Args)); RealArgs.append(CapturedVars.begin(), CapturedVars.end()); auto RTLFn = createRuntimeFunction(OMPRTL__kmpc_fork_call); CGF.EmitRuntimeCall(RTLFn, RealArgs); }; auto &&ElseGen = [this, OutlinedFn, CapturedVars, RTLoc, Loc](CodeGenFunction &CGF) { auto ThreadID = getThreadID(CGF, Loc); // Build calls: // __kmpc_serialized_parallel(&Loc, GTid); llvm::Value *Args[] = {RTLoc, ThreadID}; CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_serialized_parallel), Args); // OutlinedFn(>id, &zero, CapturedStruct); auto ThreadIDAddr = emitThreadIDAddress(CGF, Loc); Address ZeroAddr = CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4), /*Name*/ ".zero.addr"); CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); llvm::SmallVector OutlinedFnArgs; OutlinedFnArgs.push_back(ThreadIDAddr.getPointer()); OutlinedFnArgs.push_back(ZeroAddr.getPointer()); OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs); // __kmpc_end_serialized_parallel(&Loc, GTid); llvm::Value *EndArgs[] = {emitUpdateLocation(CGF, Loc), ThreadID}; CGF.EmitRuntimeCall( createRuntimeFunction(OMPRTL__kmpc_end_serialized_parallel), EndArgs); }; if (IfCond) { emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen); } else { CodeGenFunction::RunCleanupsScope Scope(CGF); ThenGen(CGF); } } // If we're inside an (outlined) parallel region, use the region info's // thread-ID variable (it is passed in a first argument of the outlined function // as "kmp_int32 *gtid"). Otherwise, if we're not inside parallel region, but in // regular serial code region, get thread ID by calling kmp_int32 // kmpc_global_thread_num(ident_t *loc), stash this thread ID in a temporary and // return the address of that temp. Address CGOpenMPRuntime::emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc) { if (auto OMPRegionInfo = dyn_cast_or_null(CGF.CapturedStmtInfo)) if (OMPRegionInfo->getThreadIDVariable()) return OMPRegionInfo->getThreadIDVariableLValue(CGF).getAddress(); auto ThreadID = getThreadID(CGF, Loc); auto Int32Ty = CGF.getContext().getIntTypeForBitwidth(/*DestWidth*/ 32, /*Signed*/ true); auto ThreadIDTemp = CGF.CreateMemTemp(Int32Ty, /*Name*/ ".threadid_temp."); CGF.EmitStoreOfScalar(ThreadID, CGF.MakeAddrLValue(ThreadIDTemp, Int32Ty)); return ThreadIDTemp; } llvm::Constant * CGOpenMPRuntime::getOrCreateInternalVariable(llvm::Type *Ty, const llvm::Twine &Name) { SmallString<256> Buffer; llvm::raw_svector_ostream Out(Buffer); Out << Name; auto RuntimeName = Out.str(); auto &Elem = *InternalVars.insert(std::make_pair(RuntimeName, nullptr)).first; if (Elem.second) { assert(Elem.second->getType()->getPointerElementType() == Ty && "OMP internal variable has different type than requested"); return &*Elem.second; } return Elem.second = new llvm::GlobalVariable( CGM.getModule(), Ty, /*IsConstant*/ false, llvm::GlobalValue::CommonLinkage, llvm::Constant::getNullValue(Ty), Elem.first()); } llvm::Value *CGOpenMPRuntime::getCriticalRegionLock(StringRef CriticalName) { llvm::Twine Name(".gomp_critical_user_", CriticalName); return getOrCreateInternalVariable(KmpCriticalNameTy, Name.concat(".var")); } namespace { template class CallEndCleanup final : public EHScopeStack::Cleanup { llvm::Value *Callee; llvm::Value *Args[N]; public: CallEndCleanup(llvm::Value *Callee, ArrayRef CleanupArgs) : Callee(Callee) { assert(CleanupArgs.size() == N); std::copy(CleanupArgs.begin(), CleanupArgs.end(), std::begin(Args)); } void Emit(CodeGenFunction &CGF, Flags /*flags*/) override { if (!CGF.HaveInsertPoint()) return; CGF.EmitRuntimeCall(Callee, Args); } }; } // anonymous namespace void CGOpenMPRuntime::emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint) { // __kmpc_critical[_with_hint](ident_t *, gtid, Lock[, hint]); // CriticalOpGen(); // __kmpc_end_critical(ident_t *, gtid, Lock); // Prepare arguments and build a call to __kmpc_critical if (!CGF.HaveInsertPoint()) return; CodeGenFunction::RunCleanupsScope Scope(CGF); llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc), getCriticalRegionLock(CriticalName)}; if (Hint) { llvm::SmallVector ArgsWithHint(std::begin(Args), std::end(Args)); auto *HintVal = CGF.EmitScalarExpr(Hint); ArgsWithHint.push_back( CGF.Builder.CreateIntCast(HintVal, CGM.IntPtrTy, /*isSigned=*/false)); CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_critical_with_hint), ArgsWithHint); } else CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_critical), Args); // Build a call to __kmpc_end_critical CGF.EHStack.pushCleanup::value>>( NormalAndEHCleanup, createRuntimeFunction(OMPRTL__kmpc_end_critical), llvm::makeArrayRef(Args)); emitInlinedDirective(CGF, OMPD_critical, CriticalOpGen); } static void emitIfStmt(CodeGenFunction &CGF, llvm::Value *IfCond, OpenMPDirectiveKind Kind, SourceLocation Loc, const RegionCodeGenTy &BodyOpGen) { llvm::Value *CallBool = CGF.EmitScalarConversion( IfCond, CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true), CGF.getContext().BoolTy, Loc); auto *ThenBlock = CGF.createBasicBlock("omp_if.then"); auto *ContBlock = CGF.createBasicBlock("omp_if.end"); // Generate the branch (If-stmt) CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock); CGF.EmitBlock(ThenBlock); CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, Kind, BodyOpGen); // Emit the rest of bblocks/branches CGF.EmitBranch(ContBlock); CGF.EmitBlock(ContBlock, true); } void CGOpenMPRuntime::emitMasterRegion(CodeGenFunction &CGF, const RegionCodeGenTy &MasterOpGen, SourceLocation Loc) { if (!CGF.HaveInsertPoint()) return; // if(__kmpc_master(ident_t *, gtid)) { // MasterOpGen(); // __kmpc_end_master(ident_t *, gtid); // } // Prepare arguments and build a call to __kmpc_master llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)}; auto *IsMaster = CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_master), Args); typedef CallEndCleanup::value> MasterCallEndCleanup; emitIfStmt( CGF, IsMaster, OMPD_master, Loc, [&](CodeGenFunction &CGF) -> void { CodeGenFunction::RunCleanupsScope Scope(CGF); CGF.EHStack.pushCleanup( NormalAndEHCleanup, createRuntimeFunction(OMPRTL__kmpc_end_master), llvm::makeArrayRef(Args)); MasterOpGen(CGF); }); } void CGOpenMPRuntime::emitTaskyieldCall(CodeGenFunction &CGF, SourceLocation Loc) { if (!CGF.HaveInsertPoint()) return; // Build call __kmpc_omp_taskyield(loc, thread_id, 0); llvm::Value *Args[] = { emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc), llvm::ConstantInt::get(CGM.IntTy, /*V=*/0, /*isSigned=*/true)}; CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_taskyield), Args); } void CGOpenMPRuntime::emitTaskgroupRegion(CodeGenFunction &CGF, const RegionCodeGenTy &TaskgroupOpGen, SourceLocation Loc) { if (!CGF.HaveInsertPoint()) return; // __kmpc_taskgroup(ident_t *, gtid); // TaskgroupOpGen(); // __kmpc_end_taskgroup(ident_t *, gtid); // Prepare arguments and build a call to __kmpc_taskgroup { CodeGenFunction::RunCleanupsScope Scope(CGF); llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)}; CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_taskgroup), Args); // Build a call to __kmpc_end_taskgroup CGF.EHStack.pushCleanup::value>>( NormalAndEHCleanup, createRuntimeFunction(OMPRTL__kmpc_end_taskgroup), llvm::makeArrayRef(Args)); emitInlinedDirective(CGF, OMPD_taskgroup, TaskgroupOpGen); } } /// Given an array of pointers to variables, project the address of a /// given variable. static Address emitAddrOfVarFromArray(CodeGenFunction &CGF, Address Array, unsigned Index, const VarDecl *Var) { // Pull out the pointer to the variable. Address PtrAddr = CGF.Builder.CreateConstArrayGEP(Array, Index, CGF.getPointerSize()); llvm::Value *Ptr = CGF.Builder.CreateLoad(PtrAddr); Address Addr = Address(Ptr, CGF.getContext().getDeclAlign(Var)); Addr = CGF.Builder.CreateElementBitCast( Addr, CGF.ConvertTypeForMem(Var->getType())); return Addr; } static llvm::Value *emitCopyprivateCopyFunction( CodeGenModule &CGM, llvm::Type *ArgsType, ArrayRef CopyprivateVars, ArrayRef DestExprs, ArrayRef SrcExprs, ArrayRef AssignmentOps) { auto &C = CGM.getContext(); // void copy_func(void *LHSArg, void *RHSArg); FunctionArgList Args; ImplicitParamDecl LHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr, C.VoidPtrTy); ImplicitParamDecl RHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr, C.VoidPtrTy); Args.push_back(&LHSArg); Args.push_back(&RHSArg); FunctionType::ExtInfo EI; auto &CGFI = CGM.getTypes().arrangeFreeFunctionDeclaration( C.VoidTy, Args, EI, /*isVariadic=*/false); auto *Fn = llvm::Function::Create( CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, ".omp.copyprivate.copy_func", &CGM.getModule()); CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI); CodeGenFunction CGF(CGM); CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args); // Dest = (void*[n])(LHSArg); // Src = (void*[n])(RHSArg); Address LHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&LHSArg)), ArgsType), CGF.getPointerAlign()); Address RHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&RHSArg)), ArgsType), CGF.getPointerAlign()); // *(Type0*)Dst[0] = *(Type0*)Src[0]; // *(Type1*)Dst[1] = *(Type1*)Src[1]; // ... // *(Typen*)Dst[n] = *(Typen*)Src[n]; for (unsigned I = 0, E = AssignmentOps.size(); I < E; ++I) { auto DestVar = cast(cast(DestExprs[I])->getDecl()); Address DestAddr = emitAddrOfVarFromArray(CGF, LHS, I, DestVar); auto SrcVar = cast(cast(SrcExprs[I])->getDecl()); Address SrcAddr = emitAddrOfVarFromArray(CGF, RHS, I, SrcVar); auto *VD = cast(CopyprivateVars[I])->getDecl(); QualType Type = VD->getType(); CGF.EmitOMPCopy(Type, DestAddr, SrcAddr, DestVar, SrcVar, AssignmentOps[I]); } CGF.FinishFunction(); return Fn; } void CGOpenMPRuntime::emitSingleRegion(CodeGenFunction &CGF, const RegionCodeGenTy &SingleOpGen, SourceLocation Loc, ArrayRef CopyprivateVars, ArrayRef SrcExprs, ArrayRef DstExprs, ArrayRef AssignmentOps) { if (!CGF.HaveInsertPoint()) return; assert(CopyprivateVars.size() == SrcExprs.size() && CopyprivateVars.size() == DstExprs.size() && CopyprivateVars.size() == AssignmentOps.size()); auto &C = CGM.getContext(); // int32 did_it = 0; // if(__kmpc_single(ident_t *, gtid)) { // SingleOpGen(); // __kmpc_end_single(ident_t *, gtid); // did_it = 1; // } // call __kmpc_copyprivate(ident_t *, gtid, , , // , did_it); Address DidIt = Address::invalid(); if (!CopyprivateVars.empty()) { // int32 did_it = 0; auto KmpInt32Ty = C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1); DidIt = CGF.CreateMemTemp(KmpInt32Ty, ".omp.copyprivate.did_it"); CGF.Builder.CreateStore(CGF.Builder.getInt32(0), DidIt); } // Prepare arguments and build a call to __kmpc_single llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)}; auto *IsSingle = CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_single), Args); typedef CallEndCleanup::value> SingleCallEndCleanup; emitIfStmt( CGF, IsSingle, OMPD_single, Loc, [&](CodeGenFunction &CGF) -> void { CodeGenFunction::RunCleanupsScope Scope(CGF); CGF.EHStack.pushCleanup( NormalAndEHCleanup, createRuntimeFunction(OMPRTL__kmpc_end_single), llvm::makeArrayRef(Args)); SingleOpGen(CGF); if (DidIt.isValid()) { // did_it = 1; CGF.Builder.CreateStore(CGF.Builder.getInt32(1), DidIt); } }); // call __kmpc_copyprivate(ident_t *, gtid, , , // , did_it); if (DidIt.isValid()) { llvm::APInt ArraySize(/*unsigned int numBits=*/32, CopyprivateVars.size()); auto CopyprivateArrayTy = C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal, /*IndexTypeQuals=*/0); // Create a list of all private variables for copyprivate. Address CopyprivateList = CGF.CreateMemTemp(CopyprivateArrayTy, ".omp.copyprivate.cpr_list"); for (unsigned I = 0, E = CopyprivateVars.size(); I < E; ++I) { Address Elem = CGF.Builder.CreateConstArrayGEP( CopyprivateList, I, CGF.getPointerSize()); CGF.Builder.CreateStore( CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( CGF.EmitLValue(CopyprivateVars[I]).getPointer(), CGF.VoidPtrTy), Elem); } // Build function that copies private values from single region to all other // threads in the corresponding parallel region. auto *CpyFn = emitCopyprivateCopyFunction( CGM, CGF.ConvertTypeForMem(CopyprivateArrayTy)->getPointerTo(), CopyprivateVars, SrcExprs, DstExprs, AssignmentOps); auto *BufSize = getTypeSize(CGF, CopyprivateArrayTy); Address CL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(CopyprivateList, CGF.VoidPtrTy); auto *DidItVal = CGF.Builder.CreateLoad(DidIt); llvm::Value *Args[] = { emitUpdateLocation(CGF, Loc), // ident_t * getThreadID(CGF, Loc), // i32 BufSize, // size_t CL.getPointer(), // void * CpyFn, // void (*) (void *, void *) DidItVal // i32 did_it }; CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_copyprivate), Args); } } void CGOpenMPRuntime::emitOrderedRegion(CodeGenFunction &CGF, const RegionCodeGenTy &OrderedOpGen, SourceLocation Loc, bool IsThreads) { if (!CGF.HaveInsertPoint()) return; // __kmpc_ordered(ident_t *, gtid); // OrderedOpGen(); // __kmpc_end_ordered(ident_t *, gtid); // Prepare arguments and build a call to __kmpc_ordered CodeGenFunction::RunCleanupsScope Scope(CGF); if (IsThreads) { llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)}; CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_ordered), Args); // Build a call to __kmpc_end_ordered CGF.EHStack.pushCleanup::value>>( NormalAndEHCleanup, createRuntimeFunction(OMPRTL__kmpc_end_ordered), llvm::makeArrayRef(Args)); } emitInlinedDirective(CGF, OMPD_ordered, OrderedOpGen); } void CGOpenMPRuntime::emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks, bool ForceSimpleCall) { if (!CGF.HaveInsertPoint()) return; // Build call __kmpc_cancel_barrier(loc, thread_id); // Build call __kmpc_barrier(loc, thread_id); OpenMPLocationFlags Flags = OMP_IDENT_KMPC; if (Kind == OMPD_for) { Flags = static_cast(Flags | OMP_IDENT_BARRIER_IMPL_FOR); } else if (Kind == OMPD_sections) { Flags = static_cast(Flags | OMP_IDENT_BARRIER_IMPL_SECTIONS); } else if (Kind == OMPD_single) { Flags = static_cast(Flags | OMP_IDENT_BARRIER_IMPL_SINGLE); } else if (Kind == OMPD_barrier) { Flags = static_cast(Flags | OMP_IDENT_BARRIER_EXPL); } else { Flags = static_cast(Flags | OMP_IDENT_BARRIER_IMPL); } // Build call __kmpc_cancel_barrier(loc, thread_id) or __kmpc_barrier(loc, // thread_id); auto *OMPRegionInfo = dyn_cast_or_null(CGF.CapturedStmtInfo); // Do not emit barrier call in the single directive emitted in some rare cases // for sections directives. if (OMPRegionInfo && OMPRegionInfo->getDirectiveKind() == OMPD_single) return; llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags), getThreadID(CGF, Loc)}; if (OMPRegionInfo) { if (!ForceSimpleCall && OMPRegionInfo->hasCancel()) { auto *Result = CGF.EmitRuntimeCall( createRuntimeFunction(OMPRTL__kmpc_cancel_barrier), Args); if (EmitChecks) { // if (__kmpc_cancel_barrier()) { // exit from construct; // } auto *ExitBB = CGF.createBasicBlock(".cancel.exit"); auto *ContBB = CGF.createBasicBlock(".cancel.continue"); auto *Cmp = CGF.Builder.CreateIsNotNull(Result); CGF.Builder.CreateCondBr(Cmp, ExitBB, ContBB); CGF.EmitBlock(ExitBB); // exit from construct; auto CancelDestination = CGF.getOMPCancelDestination(OMPRegionInfo->getDirectiveKind()); CGF.EmitBranchThroughCleanup(CancelDestination); CGF.EmitBlock(ContBB, /*IsFinished=*/true); } return; } } CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_barrier), Args); } /// \brief Schedule types for 'omp for' loops (these enumerators are taken from /// the enum sched_type in kmp.h). enum OpenMPSchedType { /// \brief Lower bound for default (unordered) versions. OMP_sch_lower = 32, OMP_sch_static_chunked = 33, OMP_sch_static = 34, OMP_sch_dynamic_chunked = 35, OMP_sch_guided_chunked = 36, OMP_sch_runtime = 37, OMP_sch_auto = 38, /// \brief Lower bound for 'ordered' versions. OMP_ord_lower = 64, OMP_ord_static_chunked = 65, OMP_ord_static = 66, OMP_ord_dynamic_chunked = 67, OMP_ord_guided_chunked = 68, OMP_ord_runtime = 69, OMP_ord_auto = 70, OMP_sch_default = OMP_sch_static, }; /// \brief Map the OpenMP loop schedule to the runtime enumeration. static OpenMPSchedType getRuntimeSchedule(OpenMPScheduleClauseKind ScheduleKind, bool Chunked, bool Ordered) { switch (ScheduleKind) { case OMPC_SCHEDULE_static: return Chunked ? (Ordered ? OMP_ord_static_chunked : OMP_sch_static_chunked) : (Ordered ? OMP_ord_static : OMP_sch_static); case OMPC_SCHEDULE_dynamic: return Ordered ? OMP_ord_dynamic_chunked : OMP_sch_dynamic_chunked; case OMPC_SCHEDULE_guided: return Ordered ? OMP_ord_guided_chunked : OMP_sch_guided_chunked; case OMPC_SCHEDULE_runtime: return Ordered ? OMP_ord_runtime : OMP_sch_runtime; case OMPC_SCHEDULE_auto: return Ordered ? OMP_ord_auto : OMP_sch_auto; case OMPC_SCHEDULE_unknown: assert(!Chunked && "chunk was specified but schedule kind not known"); return Ordered ? OMP_ord_static : OMP_sch_static; } llvm_unreachable("Unexpected runtime schedule"); } bool CGOpenMPRuntime::isStaticNonchunked(OpenMPScheduleClauseKind ScheduleKind, bool Chunked) const { auto Schedule = getRuntimeSchedule(ScheduleKind, Chunked, /*Ordered=*/false); return Schedule == OMP_sch_static; } bool CGOpenMPRuntime::isDynamic(OpenMPScheduleClauseKind ScheduleKind) const { auto Schedule = getRuntimeSchedule(ScheduleKind, /*Chunked=*/false, /*Ordered=*/false); assert(Schedule != OMP_sch_static_chunked && "cannot be chunked here"); return Schedule != OMP_sch_static; } void CGOpenMPRuntime::emitForDispatchInit(CodeGenFunction &CGF, SourceLocation Loc, OpenMPScheduleClauseKind ScheduleKind, unsigned IVSize, bool IVSigned, bool Ordered, llvm::Value *UB, llvm::Value *Chunk) { if (!CGF.HaveInsertPoint()) return; OpenMPSchedType Schedule = getRuntimeSchedule(ScheduleKind, Chunk != nullptr, Ordered); assert(Ordered || (Schedule != OMP_sch_static && Schedule != OMP_sch_static_chunked && Schedule != OMP_ord_static && Schedule != OMP_ord_static_chunked)); // Call __kmpc_dispatch_init( // ident_t *loc, kmp_int32 tid, kmp_int32 schedule, // kmp_int[32|64] lower, kmp_int[32|64] upper, // kmp_int[32|64] stride, kmp_int[32|64] chunk); // If the Chunk was not specified in the clause - use default value 1. if (Chunk == nullptr) Chunk = CGF.Builder.getIntN(IVSize, 1); llvm::Value *Args[] = { emitUpdateLocation(CGF, Loc, OMP_IDENT_KMPC), getThreadID(CGF, Loc), CGF.Builder.getInt32(Schedule), // Schedule type CGF.Builder.getIntN(IVSize, 0), // Lower UB, // Upper CGF.Builder.getIntN(IVSize, 1), // Stride Chunk // Chunk }; CGF.EmitRuntimeCall(createDispatchInitFunction(IVSize, IVSigned), Args); } void CGOpenMPRuntime::emitForStaticInit(CodeGenFunction &CGF, SourceLocation Loc, OpenMPScheduleClauseKind ScheduleKind, unsigned IVSize, bool IVSigned, bool Ordered, Address IL, Address LB, Address UB, Address ST, llvm::Value *Chunk) { if (!CGF.HaveInsertPoint()) return; OpenMPSchedType Schedule = getRuntimeSchedule(ScheduleKind, Chunk != nullptr, Ordered); assert(!Ordered); assert(Schedule == OMP_sch_static || Schedule == OMP_sch_static_chunked || Schedule == OMP_ord_static || Schedule == OMP_ord_static_chunked); // Call __kmpc_for_static_init( // ident_t *loc, kmp_int32 tid, kmp_int32 schedtype, // kmp_int32 *p_lastiter, kmp_int[32|64] *p_lower, // kmp_int[32|64] *p_upper, kmp_int[32|64] *p_stride, // kmp_int[32|64] incr, kmp_int[32|64] chunk); if (Chunk == nullptr) { assert((Schedule == OMP_sch_static || Schedule == OMP_ord_static) && "expected static non-chunked schedule"); // If the Chunk was not specified in the clause - use default value 1. Chunk = CGF.Builder.getIntN(IVSize, 1); } else { assert((Schedule == OMP_sch_static_chunked || Schedule == OMP_ord_static_chunked) && "expected static chunked schedule"); } llvm::Value *Args[] = { emitUpdateLocation(CGF, Loc, OMP_IDENT_KMPC), getThreadID(CGF, Loc), CGF.Builder.getInt32(Schedule), // Schedule type IL.getPointer(), // &isLastIter LB.getPointer(), // &LB UB.getPointer(), // &UB ST.getPointer(), // &Stride CGF.Builder.getIntN(IVSize, 1), // Incr Chunk // Chunk }; CGF.EmitRuntimeCall(createForStaticInitFunction(IVSize, IVSigned), Args); } void CGOpenMPRuntime::emitForStaticFinish(CodeGenFunction &CGF, SourceLocation Loc) { if (!CGF.HaveInsertPoint()) return; // Call __kmpc_for_static_fini(ident_t *loc, kmp_int32 tid); llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, OMP_IDENT_KMPC), getThreadID(CGF, Loc)}; CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_for_static_fini), Args); } void CGOpenMPRuntime::emitForOrderedIterationEnd(CodeGenFunction &CGF, SourceLocation Loc, unsigned IVSize, bool IVSigned) { if (!CGF.HaveInsertPoint()) return; // Call __kmpc_for_dynamic_fini_(4|8)[u](ident_t *loc, kmp_int32 tid); llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, OMP_IDENT_KMPC), getThreadID(CGF, Loc)}; CGF.EmitRuntimeCall(createDispatchFiniFunction(IVSize, IVSigned), Args); } llvm::Value *CGOpenMPRuntime::emitForNext(CodeGenFunction &CGF, SourceLocation Loc, unsigned IVSize, bool IVSigned, Address IL, Address LB, Address UB, Address ST) { // Call __kmpc_dispatch_next( // ident_t *loc, kmp_int32 tid, kmp_int32 *p_lastiter, // kmp_int[32|64] *p_lower, kmp_int[32|64] *p_upper, // kmp_int[32|64] *p_stride); llvm::Value *Args[] = { emitUpdateLocation(CGF, Loc, OMP_IDENT_KMPC), getThreadID(CGF, Loc), IL.getPointer(), // &isLastIter LB.getPointer(), // &Lower UB.getPointer(), // &Upper ST.getPointer() // &Stride }; llvm::Value *Call = CGF.EmitRuntimeCall(createDispatchNextFunction(IVSize, IVSigned), Args); return CGF.EmitScalarConversion( Call, CGF.getContext().getIntTypeForBitwidth(32, /* Signed */ true), CGF.getContext().BoolTy, Loc); } void CGOpenMPRuntime::emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) { if (!CGF.HaveInsertPoint()) return; // Build call __kmpc_push_num_threads(&loc, global_tid, num_threads) llvm::Value *Args[] = { emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc), CGF.Builder.CreateIntCast(NumThreads, CGF.Int32Ty, /*isSigned*/ true)}; CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_num_threads), Args); } void CGOpenMPRuntime::emitProcBindClause(CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind, SourceLocation Loc) { if (!CGF.HaveInsertPoint()) return; // Constants for proc bind value accepted by the runtime. enum ProcBindTy { ProcBindFalse = 0, ProcBindTrue, ProcBindMaster, ProcBindClose, ProcBindSpread, ProcBindIntel, ProcBindDefault } RuntimeProcBind; switch (ProcBind) { case OMPC_PROC_BIND_master: RuntimeProcBind = ProcBindMaster; break; case OMPC_PROC_BIND_close: RuntimeProcBind = ProcBindClose; break; case OMPC_PROC_BIND_spread: RuntimeProcBind = ProcBindSpread; break; case OMPC_PROC_BIND_unknown: llvm_unreachable("Unsupported proc_bind value."); } // Build call __kmpc_push_proc_bind(&loc, global_tid, proc_bind) llvm::Value *Args[] = { emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc), llvm::ConstantInt::get(CGM.IntTy, RuntimeProcBind, /*isSigned=*/true)}; CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_proc_bind), Args); } void CGOpenMPRuntime::emitFlush(CodeGenFunction &CGF, ArrayRef, SourceLocation Loc) { if (!CGF.HaveInsertPoint()) return; // Build call void __kmpc_flush(ident_t *loc) CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_flush), emitUpdateLocation(CGF, Loc)); } namespace { /// \brief Indexes of fields for type kmp_task_t. enum KmpTaskTFields { /// \brief List of shared variables. KmpTaskTShareds, /// \brief Task routine. KmpTaskTRoutine, /// \brief Partition id for the untied tasks. KmpTaskTPartId, /// \brief Function with call of destructors for private variables. KmpTaskTDestructors, }; } // anonymous namespace bool CGOpenMPRuntime::OffloadEntriesInfoManagerTy::empty() const { // FIXME: Add other entries type when they become supported. return OffloadEntriesTargetRegion.empty(); } /// \brief Initialize target region entry. void CGOpenMPRuntime::OffloadEntriesInfoManagerTy:: initializeTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID, StringRef ParentName, unsigned LineNum, unsigned ColNum, unsigned Order) { assert(CGM.getLangOpts().OpenMPIsDevice && "Initialization of entries is " "only required for the device " "code generation."); OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum][ColNum] = OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr); ++OffloadingEntriesNum; } void CGOpenMPRuntime::OffloadEntriesInfoManagerTy:: registerTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID, StringRef ParentName, unsigned LineNum, unsigned ColNum, llvm::Constant *Addr, llvm::Constant *ID) { // If we are emitting code for a target, the entry is already initialized, // only has to be registered. if (CGM.getLangOpts().OpenMPIsDevice) { assert(hasTargetRegionEntryInfo(DeviceID, FileID, ParentName, LineNum, ColNum) && "Entry must exist."); auto &Entry = OffloadEntriesTargetRegion[DeviceID][FileID][ParentName] [LineNum][ColNum]; assert(Entry.isValid() && "Entry not initialized!"); Entry.setAddress(Addr); Entry.setID(ID); return; } else { OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum++, Addr, ID); OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum][ColNum] = Entry; } } bool CGOpenMPRuntime::OffloadEntriesInfoManagerTy::hasTargetRegionEntryInfo( unsigned DeviceID, unsigned FileID, StringRef ParentName, unsigned LineNum, unsigned ColNum) const { auto PerDevice = OffloadEntriesTargetRegion.find(DeviceID); if (PerDevice == OffloadEntriesTargetRegion.end()) return false; auto PerFile = PerDevice->second.find(FileID); if (PerFile == PerDevice->second.end()) return false; auto PerParentName = PerFile->second.find(ParentName); if (PerParentName == PerFile->second.end()) return false; auto PerLine = PerParentName->second.find(LineNum); if (PerLine == PerParentName->second.end()) return false; auto PerColumn = PerLine->second.find(ColNum); if (PerColumn == PerLine->second.end()) return false; // Fail if this entry is already registered. if (PerColumn->second.getAddress() || PerColumn->second.getID()) return false; return true; } void CGOpenMPRuntime::OffloadEntriesInfoManagerTy::actOnTargetRegionEntriesInfo( const OffloadTargetRegionEntryInfoActTy &Action) { // Scan all target region entries and perform the provided action. for (auto &D : OffloadEntriesTargetRegion) for (auto &F : D.second) for (auto &P : F.second) for (auto &L : P.second) for (auto &C : L.second) Action(D.first, F.first, P.first(), L.first, C.first, C.second); } /// \brief Create a Ctor/Dtor-like function whose body is emitted through /// \a Codegen. This is used to emit the two functions that register and /// unregister the descriptor of the current compilation unit. static llvm::Function * createOffloadingBinaryDescriptorFunction(CodeGenModule &CGM, StringRef Name, const RegionCodeGenTy &Codegen) { auto &C = CGM.getContext(); FunctionArgList Args; ImplicitParamDecl DummyPtr(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr, C.VoidPtrTy); Args.push_back(&DummyPtr); CodeGenFunction CGF(CGM); GlobalDecl(); auto &FI = CGM.getTypes().arrangeFreeFunctionDeclaration( C.VoidTy, Args, FunctionType::ExtInfo(), /*isVariadic=*/false); auto FTy = CGM.getTypes().GetFunctionType(FI); auto *Fn = CGM.CreateGlobalInitOrDestructFunction(FTy, Name, FI, SourceLocation()); CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FI, Args, SourceLocation()); Codegen(CGF); CGF.FinishFunction(); return Fn; } llvm::Function * CGOpenMPRuntime::createOffloadingBinaryDescriptorRegistration() { // If we don't have entries or if we are emitting code for the device, we // don't need to do anything. if (CGM.getLangOpts().OpenMPIsDevice || OffloadEntriesInfoManager.empty()) return nullptr; auto &M = CGM.getModule(); auto &C = CGM.getContext(); // Get list of devices we care about auto &Devices = CGM.getLangOpts().OMPTargetTriples; // We should be creating an offloading descriptor only if there are devices // specified. assert(!Devices.empty() && "No OpenMP offloading devices??"); // Create the external variables that will point to the begin and end of the // host entries section. These will be defined by the linker. auto *OffloadEntryTy = CGM.getTypes().ConvertTypeForMem(getTgtOffloadEntryQTy()); llvm::GlobalVariable *HostEntriesBegin = new llvm::GlobalVariable( M, OffloadEntryTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, /*Initializer=*/0, ".omp_offloading.entries_begin"); llvm::GlobalVariable *HostEntriesEnd = new llvm::GlobalVariable( M, OffloadEntryTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, /*Initializer=*/0, ".omp_offloading.entries_end"); // Create all device images llvm::SmallVector DeviceImagesEntires; auto *DeviceImageTy = cast( CGM.getTypes().ConvertTypeForMem(getTgtDeviceImageQTy())); for (unsigned i = 0; i < Devices.size(); ++i) { StringRef T = Devices[i].getTriple(); auto *ImgBegin = new llvm::GlobalVariable( M, CGM.Int8Ty, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, /*Initializer=*/0, Twine(".omp_offloading.img_start.") + Twine(T)); auto *ImgEnd = new llvm::GlobalVariable( M, CGM.Int8Ty, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, /*Initializer=*/0, Twine(".omp_offloading.img_end.") + Twine(T)); llvm::Constant *Dev = llvm::ConstantStruct::get(DeviceImageTy, ImgBegin, ImgEnd, HostEntriesBegin, HostEntriesEnd, nullptr); DeviceImagesEntires.push_back(Dev); } // Create device images global array. llvm::ArrayType *DeviceImagesInitTy = llvm::ArrayType::get(DeviceImageTy, DeviceImagesEntires.size()); llvm::Constant *DeviceImagesInit = llvm::ConstantArray::get(DeviceImagesInitTy, DeviceImagesEntires); llvm::GlobalVariable *DeviceImages = new llvm::GlobalVariable( M, DeviceImagesInitTy, /*isConstant=*/true, llvm::GlobalValue::InternalLinkage, DeviceImagesInit, ".omp_offloading.device_images"); DeviceImages->setUnnamedAddr(true); // This is a Zero array to be used in the creation of the constant expressions llvm::Constant *Index[] = {llvm::Constant::getNullValue(CGM.Int32Ty), llvm::Constant::getNullValue(CGM.Int32Ty)}; // Create the target region descriptor. auto *BinaryDescriptorTy = cast( CGM.getTypes().ConvertTypeForMem(getTgtBinaryDescriptorQTy())); llvm::Constant *TargetRegionsDescriptorInit = llvm::ConstantStruct::get( BinaryDescriptorTy, llvm::ConstantInt::get(CGM.Int32Ty, Devices.size()), llvm::ConstantExpr::getGetElementPtr(DeviceImagesInitTy, DeviceImages, Index), HostEntriesBegin, HostEntriesEnd, nullptr); auto *Desc = new llvm::GlobalVariable( M, BinaryDescriptorTy, /*isConstant=*/true, llvm::GlobalValue::InternalLinkage, TargetRegionsDescriptorInit, ".omp_offloading.descriptor"); // Emit code to register or unregister the descriptor at execution // startup or closing, respectively. // Create a variable to drive the registration and unregistration of the // descriptor, so we can reuse the logic that emits Ctors and Dtors. auto *IdentInfo = &C.Idents.get(".omp_offloading.reg_unreg_var"); ImplicitParamDecl RegUnregVar(C, C.getTranslationUnitDecl(), SourceLocation(), IdentInfo, C.CharTy); auto *UnRegFn = createOffloadingBinaryDescriptorFunction( CGM, ".omp_offloading.descriptor_unreg", [&](CodeGenFunction &CGF) { CGF.EmitCallOrInvoke(createRuntimeFunction(OMPRTL__tgt_unregister_lib), Desc); }); auto *RegFn = createOffloadingBinaryDescriptorFunction( CGM, ".omp_offloading.descriptor_reg", [&](CodeGenFunction &CGF) { CGF.EmitCallOrInvoke(createRuntimeFunction(OMPRTL__tgt_register_lib), Desc); CGM.getCXXABI().registerGlobalDtor(CGF, RegUnregVar, UnRegFn, Desc); }); return RegFn; } void CGOpenMPRuntime::createOffloadEntry(llvm::Constant *Addr, StringRef Name, uint64_t Size) { auto *TgtOffloadEntryType = cast( CGM.getTypes().ConvertTypeForMem(getTgtOffloadEntryQTy())); llvm::LLVMContext &C = CGM.getModule().getContext(); llvm::Module &M = CGM.getModule(); // Make sure the address has the right type. llvm::Constant *AddrPtr = llvm::ConstantExpr::getBitCast(Addr, CGM.VoidPtrTy); // Create constant string with the name. llvm::Constant *StrPtrInit = llvm::ConstantDataArray::getString(C, Name); llvm::GlobalVariable *Str = new llvm::GlobalVariable(M, StrPtrInit->getType(), /*isConstant=*/true, llvm::GlobalValue::InternalLinkage, StrPtrInit, ".omp_offloading.entry_name"); Str->setUnnamedAddr(true); llvm::Constant *StrPtr = llvm::ConstantExpr::getBitCast(Str, CGM.Int8PtrTy); // Create the entry struct. llvm::Constant *EntryInit = llvm::ConstantStruct::get( TgtOffloadEntryType, AddrPtr, StrPtr, llvm::ConstantInt::get(CGM.SizeTy, Size), nullptr); llvm::GlobalVariable *Entry = new llvm::GlobalVariable( M, TgtOffloadEntryType, true, llvm::GlobalValue::ExternalLinkage, EntryInit, ".omp_offloading.entry"); // The entry has to be created in the section the linker expects it to be. Entry->setSection(".omp_offloading.entries"); // We can't have any padding between symbols, so we need to have 1-byte // alignment. Entry->setAlignment(1); return; } void CGOpenMPRuntime::createOffloadEntriesAndInfoMetadata() { // Emit the offloading entries and metadata so that the device codegen side // can // easily figure out what to emit. The produced metadata looks like this: // // !omp_offload.info = !{!1, ...} // // Right now we only generate metadata for function that contain target // regions. // If we do not have entries, we dont need to do anything. if (OffloadEntriesInfoManager.empty()) return; llvm::Module &M = CGM.getModule(); llvm::LLVMContext &C = M.getContext(); SmallVector OrderedEntries(OffloadEntriesInfoManager.size()); // Create the offloading info metadata node. llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info"); // Auxiliar methods to create metadata values and strings. auto getMDInt = [&](unsigned v) { return llvm::ConstantAsMetadata::get( llvm::ConstantInt::get(llvm::Type::getInt32Ty(C), v)); }; auto getMDString = [&](StringRef v) { return llvm::MDString::get(C, v); }; // Create function that emits metadata for each target region entry; auto &&TargetRegionMetadataEmitter = [&]( unsigned DeviceID, unsigned FileID, StringRef ParentName, unsigned Line, unsigned Column, OffloadEntriesInfoManagerTy::OffloadEntryInfoTargetRegion &E) { llvm::SmallVector Ops; // Generate metadata for target regions. Each entry of this metadata // contains: // - Entry 0 -> Kind of this type of metadata (0). // - Entry 1 -> Device ID of the file where the entry was identified. // - Entry 2 -> File ID of the file where the entry was identified. // - Entry 3 -> Mangled name of the function where the entry was identified. // - Entry 4 -> Line in the file where the entry was identified. // - Entry 5 -> Column in the file where the entry was identified. // - Entry 6 -> Order the entry was created. // The first element of the metadata node is the kind. Ops.push_back(getMDInt(E.getKind())); Ops.push_back(getMDInt(DeviceID)); Ops.push_back(getMDInt(FileID)); Ops.push_back(getMDString(ParentName)); Ops.push_back(getMDInt(Line)); Ops.push_back(getMDInt(Column)); Ops.push_back(getMDInt(E.getOrder())); // Save this entry in the right position of the ordered entries array. OrderedEntries[E.getOrder()] = &E; // Add metadata to the named metadata node. MD->addOperand(llvm::MDNode::get(C, Ops)); }; OffloadEntriesInfoManager.actOnTargetRegionEntriesInfo( TargetRegionMetadataEmitter); for (auto *E : OrderedEntries) { assert(E && "All ordered entries must exist!"); if (auto *CE = dyn_cast( E)) { assert(CE->getID() && CE->getAddress() && "Entry ID and Addr are invalid!"); createOffloadEntry(CE->getID(), CE->getAddress()->getName(), /*Size=*/0); } else llvm_unreachable("Unsupported entry kind."); } } /// \brief Loads all the offload entries information from the host IR /// metadata. void CGOpenMPRuntime::loadOffloadInfoMetadata() { // If we are in target mode, load the metadata from the host IR. This code has // to match the metadaata creation in createOffloadEntriesAndInfoMetadata(). if (!CGM.getLangOpts().OpenMPIsDevice) return; if (CGM.getLangOpts().OMPHostIRFile.empty()) return; auto Buf = llvm::MemoryBuffer::getFile(CGM.getLangOpts().OMPHostIRFile); if (Buf.getError()) return; llvm::LLVMContext C; auto ME = llvm::parseBitcodeFile(Buf.get()->getMemBufferRef(), C); if (ME.getError()) return; llvm::NamedMDNode *MD = ME.get()->getNamedMetadata("omp_offload.info"); if (!MD) return; for (auto I : MD->operands()) { llvm::MDNode *MN = cast(I); auto getMDInt = [&](unsigned Idx) { llvm::ConstantAsMetadata *V = cast(MN->getOperand(Idx)); return cast(V->getValue())->getZExtValue(); }; auto getMDString = [&](unsigned Idx) { llvm::MDString *V = cast(MN->getOperand(Idx)); return V->getString(); }; switch (getMDInt(0)) { default: llvm_unreachable("Unexpected metadata!"); break; case OffloadEntriesInfoManagerTy::OffloadEntryInfo:: OFFLOAD_ENTRY_INFO_TARGET_REGION: OffloadEntriesInfoManager.initializeTargetRegionEntryInfo( /*DeviceID=*/getMDInt(1), /*FileID=*/getMDInt(2), /*ParentName=*/getMDString(3), /*Line=*/getMDInt(4), /*Column=*/getMDInt(5), /*Order=*/getMDInt(6)); break; } } } void CGOpenMPRuntime::emitKmpRoutineEntryT(QualType KmpInt32Ty) { if (!KmpRoutineEntryPtrTy) { // Build typedef kmp_int32 (* kmp_routine_entry_t)(kmp_int32, void *); type. auto &C = CGM.getContext(); QualType KmpRoutineEntryTyArgs[] = {KmpInt32Ty, C.VoidPtrTy}; FunctionProtoType::ExtProtoInfo EPI; KmpRoutineEntryPtrQTy = C.getPointerType( C.getFunctionType(KmpInt32Ty, KmpRoutineEntryTyArgs, EPI)); KmpRoutineEntryPtrTy = CGM.getTypes().ConvertType(KmpRoutineEntryPtrQTy); } } static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC, QualType FieldTy) { auto *Field = FieldDecl::Create( C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy, C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()), /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit); Field->setAccess(AS_public); DC->addDecl(Field); return Field; } QualType CGOpenMPRuntime::getTgtOffloadEntryQTy() { // Make sure the type of the entry is already created. This is the type we // have to create: // struct __tgt_offload_entry{ // void *addr; // Pointer to the offload entry info. // // (function or global) // char *name; // Name of the function or global. // size_t size; // Size of the entry info (0 if it a function). // }; if (TgtOffloadEntryQTy.isNull()) { ASTContext &C = CGM.getContext(); auto *RD = C.buildImplicitRecord("__tgt_offload_entry"); RD->startDefinition(); addFieldToRecordDecl(C, RD, C.VoidPtrTy); addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy)); addFieldToRecordDecl(C, RD, C.getSizeType()); RD->completeDefinition(); TgtOffloadEntryQTy = C.getRecordType(RD); } return TgtOffloadEntryQTy; } QualType CGOpenMPRuntime::getTgtDeviceImageQTy() { // These are the types we need to build: // struct __tgt_device_image{ // void *ImageStart; // Pointer to the target code start. // void *ImageEnd; // Pointer to the target code end. // // We also add the host entries to the device image, as it may be useful // // for the target runtime to have access to that information. // __tgt_offload_entry *EntriesBegin; // Begin of the table with all // // the entries. // __tgt_offload_entry *EntriesEnd; // End of the table with all the // // entries (non inclusive). // }; if (TgtDeviceImageQTy.isNull()) { ASTContext &C = CGM.getContext(); auto *RD = C.buildImplicitRecord("__tgt_device_image"); RD->startDefinition(); addFieldToRecordDecl(C, RD, C.VoidPtrTy); addFieldToRecordDecl(C, RD, C.VoidPtrTy); addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy())); addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy())); RD->completeDefinition(); TgtDeviceImageQTy = C.getRecordType(RD); } return TgtDeviceImageQTy; } QualType CGOpenMPRuntime::getTgtBinaryDescriptorQTy() { // struct __tgt_bin_desc{ // int32_t NumDevices; // Number of devices supported. // __tgt_device_image *DeviceImages; // Arrays of device images // // (one per device). // __tgt_offload_entry *EntriesBegin; // Begin of the table with all the // // entries. // __tgt_offload_entry *EntriesEnd; // End of the table with all the // // entries (non inclusive). // }; if (TgtBinaryDescriptorQTy.isNull()) { ASTContext &C = CGM.getContext(); auto *RD = C.buildImplicitRecord("__tgt_bin_desc"); RD->startDefinition(); addFieldToRecordDecl( C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true)); addFieldToRecordDecl(C, RD, C.getPointerType(getTgtDeviceImageQTy())); addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy())); addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy())); RD->completeDefinition(); TgtBinaryDescriptorQTy = C.getRecordType(RD); } return TgtBinaryDescriptorQTy; } namespace { struct PrivateHelpersTy { PrivateHelpersTy(const VarDecl *Original, const VarDecl *PrivateCopy, const VarDecl *PrivateElemInit) : Original(Original), PrivateCopy(PrivateCopy), PrivateElemInit(PrivateElemInit) {} const VarDecl *Original; const VarDecl *PrivateCopy; const VarDecl *PrivateElemInit; }; typedef std::pair PrivateDataTy; } // anonymous namespace static RecordDecl * createPrivatesRecordDecl(CodeGenModule &CGM, ArrayRef Privates) { if (!Privates.empty()) { auto &C = CGM.getContext(); // Build struct .kmp_privates_t. { // /* private vars */ // }; auto *RD = C.buildImplicitRecord(".kmp_privates.t"); RD->startDefinition(); for (auto &&Pair : Privates) { auto *VD = Pair.second.Original; auto Type = VD->getType(); Type = Type.getNonReferenceType(); auto *FD = addFieldToRecordDecl(C, RD, Type); if (VD->hasAttrs()) { for (specific_attr_iterator I(VD->getAttrs().begin()), E(VD->getAttrs().end()); I != E; ++I) FD->addAttr(*I); } } RD->completeDefinition(); return RD; } return nullptr; } static RecordDecl * createKmpTaskTRecordDecl(CodeGenModule &CGM, QualType KmpInt32Ty, QualType KmpRoutineEntryPointerQTy) { auto &C = CGM.getContext(); // Build struct kmp_task_t { // void * shareds; // kmp_routine_entry_t routine; // kmp_int32 part_id; // kmp_routine_entry_t destructors; // }; auto *RD = C.buildImplicitRecord("kmp_task_t"); RD->startDefinition(); addFieldToRecordDecl(C, RD, C.VoidPtrTy); addFieldToRecordDecl(C, RD, KmpRoutineEntryPointerQTy); addFieldToRecordDecl(C, RD, KmpInt32Ty); addFieldToRecordDecl(C, RD, KmpRoutineEntryPointerQTy); RD->completeDefinition(); return RD; } static RecordDecl * createKmpTaskTWithPrivatesRecordDecl(CodeGenModule &CGM, QualType KmpTaskTQTy, ArrayRef Privates) { auto &C = CGM.getContext(); // Build struct kmp_task_t_with_privates { // kmp_task_t task_data; // .kmp_privates_t. privates; // }; auto *RD = C.buildImplicitRecord("kmp_task_t_with_privates"); RD->startDefinition(); addFieldToRecordDecl(C, RD, KmpTaskTQTy); if (auto *PrivateRD = createPrivatesRecordDecl(CGM, Privates)) { addFieldToRecordDecl(C, RD, C.getRecordType(PrivateRD)); } RD->completeDefinition(); return RD; } /// \brief Emit a proxy function which accepts kmp_task_t as the second /// argument. /// \code /// kmp_int32 .omp_task_entry.(kmp_int32 gtid, kmp_task_t *tt) { /// TaskFunction(gtid, tt->part_id, &tt->privates, task_privates_map, /// tt->shareds); /// return 0; /// } /// \endcode static llvm::Value * emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc, QualType KmpInt32Ty, QualType KmpTaskTWithPrivatesPtrQTy, QualType KmpTaskTWithPrivatesQTy, QualType KmpTaskTQTy, QualType SharedsPtrTy, llvm::Value *TaskFunction, llvm::Value *TaskPrivatesMap) { auto &C = CGM.getContext(); FunctionArgList Args; ImplicitParamDecl GtidArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpInt32Ty); ImplicitParamDecl TaskTypeArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpTaskTWithPrivatesPtrQTy.withRestrict()); Args.push_back(&GtidArg); Args.push_back(&TaskTypeArg); FunctionType::ExtInfo Info; auto &TaskEntryFnInfo = CGM.getTypes().arrangeFreeFunctionDeclaration(KmpInt32Ty, Args, Info, /*isVariadic=*/false); auto *TaskEntryTy = CGM.getTypes().GetFunctionType(TaskEntryFnInfo); auto *TaskEntry = llvm::Function::Create(TaskEntryTy, llvm::GlobalValue::InternalLinkage, ".omp_task_entry.", &CGM.getModule()); CGM.SetInternalFunctionAttributes(/*D=*/nullptr, TaskEntry, TaskEntryFnInfo); CodeGenFunction CGF(CGM); CGF.disableDebugInfo(); CGF.StartFunction(GlobalDecl(), KmpInt32Ty, TaskEntry, TaskEntryFnInfo, Args); // TaskFunction(gtid, tt->task_data.part_id, &tt->privates, task_privates_map, // tt->task_data.shareds); auto *GtidParam = CGF.EmitLoadOfScalar( CGF.GetAddrOfLocalVar(&GtidArg), /*Volatile=*/false, KmpInt32Ty, Loc); LValue TDBase = emitLoadOfPointerLValue( CGF, CGF.GetAddrOfLocalVar(&TaskTypeArg), KmpTaskTWithPrivatesPtrQTy); auto *KmpTaskTWithPrivatesQTyRD = cast(KmpTaskTWithPrivatesQTy->getAsTagDecl()); LValue Base = CGF.EmitLValueForField(TDBase, *KmpTaskTWithPrivatesQTyRD->field_begin()); auto *KmpTaskTQTyRD = cast(KmpTaskTQTy->getAsTagDecl()); auto PartIdFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTPartId); auto PartIdLVal = CGF.EmitLValueForField(Base, *PartIdFI); auto *PartidParam = CGF.EmitLoadOfLValue(PartIdLVal, Loc).getScalarVal(); auto SharedsFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTShareds); auto SharedsLVal = CGF.EmitLValueForField(Base, *SharedsFI); auto *SharedsParam = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( CGF.EmitLoadOfLValue(SharedsLVal, Loc).getScalarVal(), CGF.ConvertTypeForMem(SharedsPtrTy)); auto PrivatesFI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin(), 1); llvm::Value *PrivatesParam; if (PrivatesFI != KmpTaskTWithPrivatesQTyRD->field_end()) { auto PrivatesLVal = CGF.EmitLValueForField(TDBase, *PrivatesFI); PrivatesParam = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( PrivatesLVal.getPointer(), CGF.VoidPtrTy); } else { PrivatesParam = llvm::ConstantPointerNull::get(CGF.VoidPtrTy); } llvm::Value *CallArgs[] = {GtidParam, PartidParam, PrivatesParam, TaskPrivatesMap, SharedsParam}; CGF.EmitCallOrInvoke(TaskFunction, CallArgs); CGF.EmitStoreThroughLValue( RValue::get(CGF.Builder.getInt32(/*C=*/0)), CGF.MakeAddrLValue(CGF.ReturnValue, KmpInt32Ty)); CGF.FinishFunction(); return TaskEntry; } static llvm::Value *emitDestructorsFunction(CodeGenModule &CGM, SourceLocation Loc, QualType KmpInt32Ty, QualType KmpTaskTWithPrivatesPtrQTy, QualType KmpTaskTWithPrivatesQTy) { auto &C = CGM.getContext(); FunctionArgList Args; ImplicitParamDecl GtidArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpInt32Ty); ImplicitParamDecl TaskTypeArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpTaskTWithPrivatesPtrQTy.withRestrict()); Args.push_back(&GtidArg); Args.push_back(&TaskTypeArg); FunctionType::ExtInfo Info; auto &DestructorFnInfo = CGM.getTypes().arrangeFreeFunctionDeclaration(KmpInt32Ty, Args, Info, /*isVariadic=*/false); auto *DestructorFnTy = CGM.getTypes().GetFunctionType(DestructorFnInfo); auto *DestructorFn = llvm::Function::Create(DestructorFnTy, llvm::GlobalValue::InternalLinkage, ".omp_task_destructor.", &CGM.getModule()); CGM.SetInternalFunctionAttributes(/*D=*/nullptr, DestructorFn, DestructorFnInfo); CodeGenFunction CGF(CGM); CGF.disableDebugInfo(); CGF.StartFunction(GlobalDecl(), KmpInt32Ty, DestructorFn, DestructorFnInfo, Args); LValue Base = emitLoadOfPointerLValue( CGF, CGF.GetAddrOfLocalVar(&TaskTypeArg), KmpTaskTWithPrivatesPtrQTy); auto *KmpTaskTWithPrivatesQTyRD = cast(KmpTaskTWithPrivatesQTy->getAsTagDecl()); auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin()); Base = CGF.EmitLValueForField(Base, *FI); for (auto *Field : cast(FI->getType()->getAsTagDecl())->fields()) { if (auto DtorKind = Field->getType().isDestructedType()) { auto FieldLValue = CGF.EmitLValueForField(Base, Field); CGF.pushDestroy(DtorKind, FieldLValue.getAddress(), Field->getType()); } } CGF.FinishFunction(); return DestructorFn; } /// \brief Emit a privates mapping function for correct handling of private and /// firstprivate variables. /// \code /// void .omp_task_privates_map.(const .privates. *noalias privs, /// **noalias priv1,..., **noalias privn) { /// *priv1 = &.privates.priv1; /// ...; /// *privn = &.privates.privn; /// } /// \endcode static llvm::Value * emitTaskPrivateMappingFunction(CodeGenModule &CGM, SourceLocation Loc, ArrayRef PrivateVars, ArrayRef FirstprivateVars, QualType PrivatesQTy, ArrayRef Privates) { auto &C = CGM.getContext(); FunctionArgList Args; ImplicitParamDecl TaskPrivatesArg( C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.getPointerType(PrivatesQTy).withConst().withRestrict()); Args.push_back(&TaskPrivatesArg); llvm::DenseMap PrivateVarsPos; unsigned Counter = 1; for (auto *E: PrivateVars) { Args.push_back(ImplicitParamDecl::Create( C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.getPointerType(C.getPointerType(E->getType())) .withConst() .withRestrict())); auto *VD = cast(cast(E)->getDecl()); PrivateVarsPos[VD] = Counter; ++Counter; } for (auto *E : FirstprivateVars) { Args.push_back(ImplicitParamDecl::Create( C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.getPointerType(C.getPointerType(E->getType())) .withConst() .withRestrict())); auto *VD = cast(cast(E)->getDecl()); PrivateVarsPos[VD] = Counter; ++Counter; } FunctionType::ExtInfo Info; auto &TaskPrivatesMapFnInfo = CGM.getTypes().arrangeFreeFunctionDeclaration(C.VoidTy, Args, Info, /*isVariadic=*/false); auto *TaskPrivatesMapTy = CGM.getTypes().GetFunctionType(TaskPrivatesMapFnInfo); auto *TaskPrivatesMap = llvm::Function::Create( TaskPrivatesMapTy, llvm::GlobalValue::InternalLinkage, ".omp_task_privates_map.", &CGM.getModule()); CGM.SetInternalFunctionAttributes(/*D=*/nullptr, TaskPrivatesMap, TaskPrivatesMapFnInfo); TaskPrivatesMap->addFnAttr(llvm::Attribute::AlwaysInline); CodeGenFunction CGF(CGM); CGF.disableDebugInfo(); CGF.StartFunction(GlobalDecl(), C.VoidTy, TaskPrivatesMap, TaskPrivatesMapFnInfo, Args); // *privi = &.privates.privi; LValue Base = emitLoadOfPointerLValue( CGF, CGF.GetAddrOfLocalVar(&TaskPrivatesArg), TaskPrivatesArg.getType()); auto *PrivatesQTyRD = cast(PrivatesQTy->getAsTagDecl()); Counter = 0; for (auto *Field : PrivatesQTyRD->fields()) { auto FieldLVal = CGF.EmitLValueForField(Base, Field); auto *VD = Args[PrivateVarsPos[Privates[Counter].second.Original]]; auto RefLVal = CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); auto RefLoadLVal = emitLoadOfPointerLValue(CGF, RefLVal.getAddress(), RefLVal.getType()); CGF.EmitStoreOfScalar(FieldLVal.getPointer(), RefLoadLVal); ++Counter; } CGF.FinishFunction(); return TaskPrivatesMap; } static int array_pod_sort_comparator(const PrivateDataTy *P1, const PrivateDataTy *P2) { return P1->first < P2->first ? 1 : (P2->first < P1->first ? -1 : 0); } void CGOpenMPRuntime::emitTaskCall( CodeGenFunction &CGF, SourceLocation Loc, const OMPExecutableDirective &D, bool Tied, llvm::PointerIntPair Final, llvm::Value *TaskFunction, QualType SharedsTy, Address Shareds, const Expr *IfCond, ArrayRef PrivateVars, ArrayRef PrivateCopies, ArrayRef FirstprivateVars, ArrayRef FirstprivateCopies, ArrayRef FirstprivateInits, ArrayRef> Dependences) { if (!CGF.HaveInsertPoint()) return; auto &C = CGM.getContext(); llvm::SmallVector Privates; // Aggregate privates and sort them by the alignment. auto I = PrivateCopies.begin(); for (auto *E : PrivateVars) { auto *VD = cast(cast(E)->getDecl()); Privates.push_back(std::make_pair( C.getDeclAlign(VD), PrivateHelpersTy(VD, cast(cast(*I)->getDecl()), /*PrivateElemInit=*/nullptr))); ++I; } I = FirstprivateCopies.begin(); auto IElemInitRef = FirstprivateInits.begin(); for (auto *E : FirstprivateVars) { auto *VD = cast(cast(E)->getDecl()); Privates.push_back(std::make_pair( C.getDeclAlign(VD), PrivateHelpersTy( VD, cast(cast(*I)->getDecl()), cast(cast(*IElemInitRef)->getDecl())))); ++I, ++IElemInitRef; } llvm::array_pod_sort(Privates.begin(), Privates.end(), array_pod_sort_comparator); auto KmpInt32Ty = C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1); // Build type kmp_routine_entry_t (if not built yet). emitKmpRoutineEntryT(KmpInt32Ty); // Build type kmp_task_t (if not built yet). if (KmpTaskTQTy.isNull()) { KmpTaskTQTy = C.getRecordType( createKmpTaskTRecordDecl(CGM, KmpInt32Ty, KmpRoutineEntryPtrQTy)); } auto *KmpTaskTQTyRD = cast(KmpTaskTQTy->getAsTagDecl()); // Build particular struct kmp_task_t for the given task. auto *KmpTaskTWithPrivatesQTyRD = createKmpTaskTWithPrivatesRecordDecl(CGM, KmpTaskTQTy, Privates); auto KmpTaskTWithPrivatesQTy = C.getRecordType(KmpTaskTWithPrivatesQTyRD); QualType KmpTaskTWithPrivatesPtrQTy = C.getPointerType(KmpTaskTWithPrivatesQTy); auto *KmpTaskTWithPrivatesTy = CGF.ConvertType(KmpTaskTWithPrivatesQTy); auto *KmpTaskTWithPrivatesPtrTy = KmpTaskTWithPrivatesTy->getPointerTo(); auto *KmpTaskTWithPrivatesTySize = getTypeSize(CGF, KmpTaskTWithPrivatesQTy); QualType SharedsPtrTy = C.getPointerType(SharedsTy); // Emit initial values for private copies (if any). llvm::Value *TaskPrivatesMap = nullptr; auto *TaskPrivatesMapTy = std::next(cast(TaskFunction)->getArgumentList().begin(), 3) ->getType(); if (!Privates.empty()) { auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin()); TaskPrivatesMap = emitTaskPrivateMappingFunction( CGM, Loc, PrivateVars, FirstprivateVars, FI->getType(), Privates); TaskPrivatesMap = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( TaskPrivatesMap, TaskPrivatesMapTy); } else { TaskPrivatesMap = llvm::ConstantPointerNull::get( cast(TaskPrivatesMapTy)); } // Build a proxy function kmp_int32 .omp_task_entry.(kmp_int32 gtid, // kmp_task_t *tt); auto *TaskEntry = emitProxyTaskFunction( CGM, Loc, KmpInt32Ty, KmpTaskTWithPrivatesPtrQTy, KmpTaskTWithPrivatesQTy, KmpTaskTQTy, SharedsPtrTy, TaskFunction, TaskPrivatesMap); // Build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid, // kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds, // kmp_routine_entry_t *task_entry); // Task flags. Format is taken from // http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h, // description of kmp_tasking_flags struct. const unsigned TiedFlag = 0x1; const unsigned FinalFlag = 0x2; unsigned Flags = Tied ? TiedFlag : 0; auto *TaskFlags = Final.getPointer() ? CGF.Builder.CreateSelect(Final.getPointer(), CGF.Builder.getInt32(FinalFlag), CGF.Builder.getInt32(/*C=*/0)) : CGF.Builder.getInt32(Final.getInt() ? FinalFlag : 0); TaskFlags = CGF.Builder.CreateOr(TaskFlags, CGF.Builder.getInt32(Flags)); auto *SharedsSize = CGM.getSize(C.getTypeSizeInChars(SharedsTy)); llvm::Value *AllocArgs[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc), TaskFlags, KmpTaskTWithPrivatesTySize, SharedsSize, CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( TaskEntry, KmpRoutineEntryPtrTy)}; auto *NewTask = CGF.EmitRuntimeCall( createRuntimeFunction(OMPRTL__kmpc_omp_task_alloc), AllocArgs); auto *NewTaskNewTaskTTy = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( NewTask, KmpTaskTWithPrivatesPtrTy); LValue Base = CGF.MakeNaturalAlignAddrLValue(NewTaskNewTaskTTy, KmpTaskTWithPrivatesQTy); LValue TDBase = CGF.EmitLValueForField(Base, *KmpTaskTWithPrivatesQTyRD->field_begin()); // Fill the data in the resulting kmp_task_t record. // Copy shareds if there are any. Address KmpTaskSharedsPtr = Address::invalid(); if (!SharedsTy->getAsStructureType()->getDecl()->field_empty()) { KmpTaskSharedsPtr = Address(CGF.EmitLoadOfScalar( CGF.EmitLValueForField( TDBase, *std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTShareds)), Loc), CGF.getNaturalTypeAlignment(SharedsTy)); CGF.EmitAggregateCopy(KmpTaskSharedsPtr, Shareds, SharedsTy); } // Emit initial values for private copies (if any). bool NeedsCleanup = false; if (!Privates.empty()) { auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin()); auto PrivatesBase = CGF.EmitLValueForField(Base, *FI); FI = cast(FI->getType()->getAsTagDecl())->field_begin(); LValue SharedsBase; if (!FirstprivateVars.empty()) { SharedsBase = CGF.MakeAddrLValue( CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( KmpTaskSharedsPtr, CGF.ConvertTypeForMem(SharedsPtrTy)), SharedsTy); } CodeGenFunction::CGCapturedStmtInfo CapturesInfo( cast(*D.getAssociatedStmt())); for (auto &&Pair : Privates) { auto *VD = Pair.second.PrivateCopy; auto *Init = VD->getAnyInitializer(); LValue PrivateLValue = CGF.EmitLValueForField(PrivatesBase, *FI); if (Init) { if (auto *Elem = Pair.second.PrivateElemInit) { auto *OriginalVD = Pair.second.Original; auto *SharedField = CapturesInfo.lookup(OriginalVD); auto SharedRefLValue = CGF.EmitLValueForField(SharedsBase, SharedField); SharedRefLValue = CGF.MakeAddrLValue( Address(SharedRefLValue.getPointer(), C.getDeclAlign(OriginalVD)), SharedRefLValue.getType(), AlignmentSource::Decl); QualType Type = OriginalVD->getType(); if (Type->isArrayType()) { // Initialize firstprivate array. if (!isa(Init) || CGF.isTrivialInitializer(Init)) { // Perform simple memcpy. CGF.EmitAggregateAssign(PrivateLValue.getAddress(), SharedRefLValue.getAddress(), Type); } else { // Initialize firstprivate array using element-by-element // intialization. CGF.EmitOMPAggregateAssign( PrivateLValue.getAddress(), SharedRefLValue.getAddress(), Type, [&CGF, Elem, Init, &CapturesInfo]( Address DestElement, Address SrcElement) { // Clean up any temporaries needed by the initialization. CodeGenFunction::OMPPrivateScope InitScope(CGF); InitScope.addPrivate(Elem, [SrcElement]() -> Address { return SrcElement; }); (void)InitScope.Privatize(); // Emit initialization for single element. CodeGenFunction::CGCapturedStmtRAII CapInfoRAII( CGF, &CapturesInfo); CGF.EmitAnyExprToMem(Init, DestElement, Init->getType().getQualifiers(), /*IsInitializer=*/false); }); } } else { CodeGenFunction::OMPPrivateScope InitScope(CGF); InitScope.addPrivate(Elem, [SharedRefLValue]() -> Address { return SharedRefLValue.getAddress(); }); (void)InitScope.Privatize(); CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CapturesInfo); CGF.EmitExprAsInit(Init, VD, PrivateLValue, /*capturedByInit=*/false); } } else { CGF.EmitExprAsInit(Init, VD, PrivateLValue, /*capturedByInit=*/false); } } NeedsCleanup = NeedsCleanup || FI->getType().isDestructedType(); ++FI; } } // Provide pointer to function with destructors for privates. llvm::Value *DestructorFn = NeedsCleanup ? emitDestructorsFunction(CGM, Loc, KmpInt32Ty, KmpTaskTWithPrivatesPtrQTy, KmpTaskTWithPrivatesQTy) : llvm::ConstantPointerNull::get( cast(KmpRoutineEntryPtrTy)); LValue Destructor = CGF.EmitLValueForField( TDBase, *std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTDestructors)); CGF.EmitStoreOfScalar(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( DestructorFn, KmpRoutineEntryPtrTy), Destructor); // Process list of dependences. Address DependenciesArray = Address::invalid(); unsigned NumDependencies = Dependences.size(); if (NumDependencies) { // Dependence kind for RTL. enum RTLDependenceKindTy { DepIn = 0x01, DepInOut = 0x3 }; enum RTLDependInfoFieldsTy { BaseAddr, Len, Flags }; RecordDecl *KmpDependInfoRD; QualType FlagsTy = C.getIntTypeForBitwidth(C.getTypeSize(C.BoolTy), /*Signed=*/false); llvm::Type *LLVMFlagsTy = CGF.ConvertTypeForMem(FlagsTy); if (KmpDependInfoTy.isNull()) { KmpDependInfoRD = C.buildImplicitRecord("kmp_depend_info"); KmpDependInfoRD->startDefinition(); addFieldToRecordDecl(C, KmpDependInfoRD, C.getIntPtrType()); addFieldToRecordDecl(C, KmpDependInfoRD, C.getSizeType()); addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy); KmpDependInfoRD->completeDefinition(); KmpDependInfoTy = C.getRecordType(KmpDependInfoRD); } else { KmpDependInfoRD = cast(KmpDependInfoTy->getAsTagDecl()); } CharUnits DependencySize = C.getTypeSizeInChars(KmpDependInfoTy); // Define type kmp_depend_info[]; QualType KmpDependInfoArrayTy = C.getConstantArrayType( KmpDependInfoTy, llvm::APInt(/*numBits=*/64, NumDependencies), ArrayType::Normal, /*IndexTypeQuals=*/0); // kmp_depend_info[] deps; DependenciesArray = CGF.CreateMemTemp(KmpDependInfoArrayTy); for (unsigned i = 0; i < NumDependencies; ++i) { const Expr *E = Dependences[i].second; auto Addr = CGF.EmitLValue(E); llvm::Value *Size; QualType Ty = E->getType(); if (auto *ASE = dyn_cast(E->IgnoreParenImpCasts())) { LValue UpAddrLVal = CGF.EmitOMPArraySectionExpr(ASE, /*LowerBound=*/false); llvm::Value *UpAddr = CGF.Builder.CreateConstGEP1_32(UpAddrLVal.getPointer(), /*Idx0=*/1); llvm::Value *LowIntPtr = CGF.Builder.CreatePtrToInt(Addr.getPointer(), CGM.SizeTy); llvm::Value *UpIntPtr = CGF.Builder.CreatePtrToInt(UpAddr, CGM.SizeTy); Size = CGF.Builder.CreateNUWSub(UpIntPtr, LowIntPtr); } else Size = getTypeSize(CGF, Ty); auto Base = CGF.MakeAddrLValue( CGF.Builder.CreateConstArrayGEP(DependenciesArray, i, DependencySize), KmpDependInfoTy); // deps[i].base_addr = &; auto BaseAddrLVal = CGF.EmitLValueForField( Base, *std::next(KmpDependInfoRD->field_begin(), BaseAddr)); CGF.EmitStoreOfScalar( CGF.Builder.CreatePtrToInt(Addr.getPointer(), CGF.IntPtrTy), BaseAddrLVal); // deps[i].len = sizeof(); auto LenLVal = CGF.EmitLValueForField( Base, *std::next(KmpDependInfoRD->field_begin(), Len)); CGF.EmitStoreOfScalar(Size, LenLVal); // deps[i].flags = ; RTLDependenceKindTy DepKind; switch (Dependences[i].first) { case OMPC_DEPEND_in: DepKind = DepIn; break; // Out and InOut dependencies must use the same code. case OMPC_DEPEND_out: case OMPC_DEPEND_inout: DepKind = DepInOut; break; case OMPC_DEPEND_source: case OMPC_DEPEND_sink: case OMPC_DEPEND_unknown: llvm_unreachable("Unknown task dependence type"); } auto FlagsLVal = CGF.EmitLValueForField( Base, *std::next(KmpDependInfoRD->field_begin(), Flags)); CGF.EmitStoreOfScalar(llvm::ConstantInt::get(LLVMFlagsTy, DepKind), FlagsLVal); } DependenciesArray = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( CGF.Builder.CreateStructGEP(DependenciesArray, 0, CharUnits::Zero()), CGF.VoidPtrTy); } // NOTE: routine and part_id fields are intialized by __kmpc_omp_task_alloc() // libcall. // Build kmp_int32 __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t // *new_task); // Build kmp_int32 __kmpc_omp_task_with_deps(ident_t *, kmp_int32 gtid, // kmp_task_t *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list, // kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list) if dependence // list is not empty auto *ThreadID = getThreadID(CGF, Loc); auto *UpLoc = emitUpdateLocation(CGF, Loc); llvm::Value *TaskArgs[] = { UpLoc, ThreadID, NewTask }; llvm::Value *DepTaskArgs[7]; if (NumDependencies) { DepTaskArgs[0] = UpLoc; DepTaskArgs[1] = ThreadID; DepTaskArgs[2] = NewTask; DepTaskArgs[3] = CGF.Builder.getInt32(NumDependencies); DepTaskArgs[4] = DependenciesArray.getPointer(); DepTaskArgs[5] = CGF.Builder.getInt32(0); DepTaskArgs[6] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy); } auto &&ThenCodeGen = [this, NumDependencies, &TaskArgs, &DepTaskArgs](CodeGenFunction &CGF) { // TODO: add check for untied tasks. if (NumDependencies) { CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_task_with_deps), DepTaskArgs); } else { CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_task), TaskArgs); } }; typedef CallEndCleanup::value> IfCallEndCleanup; llvm::Value *DepWaitTaskArgs[6]; if (NumDependencies) { DepWaitTaskArgs[0] = UpLoc; DepWaitTaskArgs[1] = ThreadID; DepWaitTaskArgs[2] = CGF.Builder.getInt32(NumDependencies); DepWaitTaskArgs[3] = DependenciesArray.getPointer(); DepWaitTaskArgs[4] = CGF.Builder.getInt32(0); DepWaitTaskArgs[5] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy); } auto &&ElseCodeGen = [this, &TaskArgs, ThreadID, NewTaskNewTaskTTy, TaskEntry, NumDependencies, &DepWaitTaskArgs](CodeGenFunction &CGF) { CodeGenFunction::RunCleanupsScope LocalScope(CGF); // Build void __kmpc_omp_wait_deps(ident_t *, kmp_int32 gtid, // kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32 // ndeps_noalias, kmp_depend_info_t *noalias_dep_list); if dependence info // is specified. if (NumDependencies) CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_wait_deps), DepWaitTaskArgs); // Build void __kmpc_omp_task_begin_if0(ident_t *, kmp_int32 gtid, // kmp_task_t *new_task); CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_task_begin_if0), TaskArgs); // Build void __kmpc_omp_task_complete_if0(ident_t *, kmp_int32 gtid, // kmp_task_t *new_task); CGF.EHStack.pushCleanup( NormalAndEHCleanup, createRuntimeFunction(OMPRTL__kmpc_omp_task_complete_if0), llvm::makeArrayRef(TaskArgs)); // Call proxy_task_entry(gtid, new_task); llvm::Value *OutlinedFnArgs[] = {ThreadID, NewTaskNewTaskTTy}; CGF.EmitCallOrInvoke(TaskEntry, OutlinedFnArgs); }; if (IfCond) { emitOMPIfClause(CGF, IfCond, ThenCodeGen, ElseCodeGen); } else { CodeGenFunction::RunCleanupsScope Scope(CGF); ThenCodeGen(CGF); } } /// \brief Emit reduction operation for each element of array (required for /// array sections) LHS op = RHS. /// \param Type Type of array. /// \param LHSVar Variable on the left side of the reduction operation /// (references element of array in original variable). /// \param RHSVar Variable on the right side of the reduction operation /// (references element of array in original variable). /// \param RedOpGen Generator of reduction operation with use of LHSVar and /// RHSVar. static void EmitOMPAggregateReduction( CodeGenFunction &CGF, QualType Type, const VarDecl *LHSVar, const VarDecl *RHSVar, const llvm::function_ref &RedOpGen, const Expr *XExpr = nullptr, const Expr *EExpr = nullptr, const Expr *UpExpr = nullptr) { // Perform element-by-element initialization. QualType ElementTy; Address LHSAddr = CGF.GetAddrOfLocalVar(LHSVar); Address RHSAddr = CGF.GetAddrOfLocalVar(RHSVar); // Drill down to the base element type on both arrays. auto ArrayTy = Type->getAsArrayTypeUnsafe(); auto NumElements = CGF.emitArrayLength(ArrayTy, ElementTy, LHSAddr); auto RHSBegin = RHSAddr.getPointer(); auto LHSBegin = LHSAddr.getPointer(); // Cast from pointer to array type to pointer to single element. auto LHSEnd = CGF.Builder.CreateGEP(LHSBegin, NumElements); // The basic structure here is a while-do loop. auto BodyBB = CGF.createBasicBlock("omp.arraycpy.body"); auto DoneBB = CGF.createBasicBlock("omp.arraycpy.done"); auto IsEmpty = CGF.Builder.CreateICmpEQ(LHSBegin, LHSEnd, "omp.arraycpy.isempty"); CGF.Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB); // Enter the loop body, making that address the current address. auto EntryBB = CGF.Builder.GetInsertBlock(); CGF.EmitBlock(BodyBB); CharUnits ElementSize = CGF.getContext().getTypeSizeInChars(ElementTy); llvm::PHINode *RHSElementPHI = CGF.Builder.CreatePHI( RHSBegin->getType(), 2, "omp.arraycpy.srcElementPast"); RHSElementPHI->addIncoming(RHSBegin, EntryBB); Address RHSElementCurrent = Address(RHSElementPHI, RHSAddr.getAlignment().alignmentOfArrayElement(ElementSize)); llvm::PHINode *LHSElementPHI = CGF.Builder.CreatePHI( LHSBegin->getType(), 2, "omp.arraycpy.destElementPast"); LHSElementPHI->addIncoming(LHSBegin, EntryBB); Address LHSElementCurrent = Address(LHSElementPHI, LHSAddr.getAlignment().alignmentOfArrayElement(ElementSize)); // Emit copy. CodeGenFunction::OMPPrivateScope Scope(CGF); Scope.addPrivate(LHSVar, [=]() -> Address { return LHSElementCurrent; }); Scope.addPrivate(RHSVar, [=]() -> Address { return RHSElementCurrent; }); Scope.Privatize(); RedOpGen(CGF, XExpr, EExpr, UpExpr); Scope.ForceCleanup(); // Shift the address forward by one element. auto LHSElementNext = CGF.Builder.CreateConstGEP1_32( LHSElementPHI, /*Idx0=*/1, "omp.arraycpy.dest.element"); auto RHSElementNext = CGF.Builder.CreateConstGEP1_32( RHSElementPHI, /*Idx0=*/1, "omp.arraycpy.src.element"); // Check whether we've reached the end. auto Done = CGF.Builder.CreateICmpEQ(LHSElementNext, LHSEnd, "omp.arraycpy.done"); CGF.Builder.CreateCondBr(Done, DoneBB, BodyBB); LHSElementPHI->addIncoming(LHSElementNext, CGF.Builder.GetInsertBlock()); RHSElementPHI->addIncoming(RHSElementNext, CGF.Builder.GetInsertBlock()); // Done. CGF.EmitBlock(DoneBB, /*IsFinished=*/true); } static llvm::Value *emitReductionFunction(CodeGenModule &CGM, llvm::Type *ArgsType, ArrayRef Privates, ArrayRef LHSExprs, ArrayRef RHSExprs, ArrayRef ReductionOps) { auto &C = CGM.getContext(); // void reduction_func(void *LHSArg, void *RHSArg); FunctionArgList Args; ImplicitParamDecl LHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr, C.VoidPtrTy); ImplicitParamDecl RHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr, C.VoidPtrTy); Args.push_back(&LHSArg); Args.push_back(&RHSArg); FunctionType::ExtInfo EI; auto &CGFI = CGM.getTypes().arrangeFreeFunctionDeclaration( C.VoidTy, Args, EI, /*isVariadic=*/false); auto *Fn = llvm::Function::Create( CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, ".omp.reduction.reduction_func", &CGM.getModule()); CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI); CodeGenFunction CGF(CGM); CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args); // Dst = (void*[n])(LHSArg); // Src = (void*[n])(RHSArg); Address LHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&LHSArg)), ArgsType), CGF.getPointerAlign()); Address RHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&RHSArg)), ArgsType), CGF.getPointerAlign()); // ... // *(Type*)lhs[i] = RedOp(*(Type*)lhs[i], *(Type*)rhs[i]); // ... CodeGenFunction::OMPPrivateScope Scope(CGF); auto IPriv = Privates.begin(); unsigned Idx = 0; for (unsigned I = 0, E = ReductionOps.size(); I < E; ++I, ++IPriv, ++Idx) { auto RHSVar = cast(cast(RHSExprs[I])->getDecl()); Scope.addPrivate(RHSVar, [&]() -> Address { return emitAddrOfVarFromArray(CGF, RHS, Idx, RHSVar); }); auto LHSVar = cast(cast(LHSExprs[I])->getDecl()); Scope.addPrivate(LHSVar, [&]() -> Address { return emitAddrOfVarFromArray(CGF, LHS, Idx, LHSVar); }); QualType PrivTy = (*IPriv)->getType(); if (PrivTy->isArrayType()) { // Get array size and emit VLA type. ++Idx; Address Elem = CGF.Builder.CreateConstArrayGEP(LHS, Idx, CGF.getPointerSize()); llvm::Value *Ptr = CGF.Builder.CreateLoad(Elem); CodeGenFunction::OpaqueValueMapping OpaqueMap( CGF, cast( CGF.getContext().getAsVariableArrayType(PrivTy)->getSizeExpr()), RValue::get(CGF.Builder.CreatePtrToInt(Ptr, CGF.SizeTy))); CGF.EmitVariablyModifiedType(PrivTy); } } Scope.Privatize(); IPriv = Privates.begin(); auto ILHS = LHSExprs.begin(); auto IRHS = RHSExprs.begin(); for (auto *E : ReductionOps) { if ((*IPriv)->getType()->isArrayType()) { // Emit reduction for array section. auto *LHSVar = cast(cast(*ILHS)->getDecl()); auto *RHSVar = cast(cast(*IRHS)->getDecl()); EmitOMPAggregateReduction(CGF, (*IPriv)->getType(), LHSVar, RHSVar, [=](CodeGenFunction &CGF, const Expr *, const Expr *, const Expr *) { CGF.EmitIgnoredExpr(E); }); } else // Emit reduction for array subscript or single variable. CGF.EmitIgnoredExpr(E); ++IPriv, ++ILHS, ++IRHS; } Scope.ForceCleanup(); CGF.FinishFunction(); return Fn; } void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef Privates, ArrayRef LHSExprs, ArrayRef RHSExprs, ArrayRef ReductionOps, bool WithNowait, bool SimpleReduction) { if (!CGF.HaveInsertPoint()) return; // Next code should be emitted for reduction: // // static kmp_critical_name lock = { 0 }; // // void reduce_func(void *lhs[], void *rhs[]) { // *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]); // ... // *(Type-1*)lhs[-1] = ReductionOperation-1(*(Type-1*)lhs[-1], // *(Type-1*)rhs[-1]); // } // // ... // void *RedList[] = {&[0], ..., &[-1]}; // switch (__kmpc_reduce{_nowait}(, , , sizeof(RedList), // RedList, reduce_func, &)) { // case 1: // ... // [i] = RedOp(*[i], *[i]); // ... // __kmpc_end_reduce{_nowait}(, , &); // break; // case 2: // ... // Atomic([i] = RedOp(*[i], *[i])); // ... // [__kmpc_end_reduce(, , &);] // break; // default:; // } // // if SimpleReduction is true, only the next code is generated: // ... // [i] = RedOp(*[i], *[i]); // ... auto &C = CGM.getContext(); if (SimpleReduction) { CodeGenFunction::RunCleanupsScope Scope(CGF); auto IPriv = Privates.begin(); auto ILHS = LHSExprs.begin(); auto IRHS = RHSExprs.begin(); for (auto *E : ReductionOps) { if ((*IPriv)->getType()->isArrayType()) { auto *LHSVar = cast(cast(*ILHS)->getDecl()); auto *RHSVar = cast(cast(*IRHS)->getDecl()); EmitOMPAggregateReduction( CGF, (*IPriv)->getType(), LHSVar, RHSVar, [=](CodeGenFunction &CGF, const Expr *, const Expr *, const Expr *) { CGF.EmitIgnoredExpr(E); }); } else CGF.EmitIgnoredExpr(E); ++IPriv, ++ILHS, ++IRHS; } return; } // 1. Build a list of reduction variables. // void *RedList[] = {[0], ..., [-1]}; auto Size = RHSExprs.size(); for (auto *E : Privates) { if (E->getType()->isArrayType()) // Reserve place for array size. ++Size; } llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); QualType ReductionArrayTy = C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal, /*IndexTypeQuals=*/0); Address ReductionList = CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); auto IPriv = Privates.begin(); unsigned Idx = 0; for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) { Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, CGF.getPointerSize()); CGF.Builder.CreateStore( CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy), Elem); if ((*IPriv)->getType()->isArrayType()) { // Store array size. ++Idx; Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, CGF.getPointerSize()); CGF.Builder.CreateStore( CGF.Builder.CreateIntToPtr( CGF.Builder.CreateIntCast( CGF.getVLASize(CGF.getContext().getAsVariableArrayType( (*IPriv)->getType())) .first, CGF.SizeTy, /*isSigned=*/false), CGF.VoidPtrTy), Elem); } } // 2. Emit reduce_func(). auto *ReductionFn = emitReductionFunction( CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates, LHSExprs, RHSExprs, ReductionOps); // 3. Create static kmp_critical_name lock = { 0 }; auto *Lock = getCriticalRegionLock(".reduction"); // 4. Build res = __kmpc_reduce{_nowait}(, , , sizeof(RedList), // RedList, reduce_func, &); auto *IdentTLoc = emitUpdateLocation( CGF, Loc, static_cast(OMP_IDENT_KMPC | OMP_ATOMIC_REDUCE)); auto *ThreadId = getThreadID(CGF, Loc); auto *ReductionArrayTySize = getTypeSize(CGF, ReductionArrayTy); auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList.getPointer(), CGF.VoidPtrTy); llvm::Value *Args[] = { IdentTLoc, // ident_t * ThreadId, // i32 CGF.Builder.getInt32(RHSExprs.size()), // i32 ReductionArrayTySize, // size_type sizeof(RedList) RL, // void *RedList ReductionFn, // void (*) (void *, void *) Lock // kmp_critical_name *& }; auto Res = CGF.EmitRuntimeCall( createRuntimeFunction(WithNowait ? OMPRTL__kmpc_reduce_nowait : OMPRTL__kmpc_reduce), Args); // 5. Build switch(res) auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default"); auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/2); // 6. Build case 1: // ... // [i] = RedOp(*[i], *[i]); // ... // __kmpc_end_reduce{_nowait}(, , &); // break; auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1"); SwInst->addCase(CGF.Builder.getInt32(1), Case1BB); CGF.EmitBlock(Case1BB); { CodeGenFunction::RunCleanupsScope Scope(CGF); // Add emission of __kmpc_end_reduce{_nowait}(, , &); llvm::Value *EndArgs[] = { IdentTLoc, // ident_t * ThreadId, // i32 Lock // kmp_critical_name *& }; CGF.EHStack .pushCleanup::value>>( NormalAndEHCleanup, createRuntimeFunction(WithNowait ? OMPRTL__kmpc_end_reduce_nowait : OMPRTL__kmpc_end_reduce), llvm::makeArrayRef(EndArgs)); auto IPriv = Privates.begin(); auto ILHS = LHSExprs.begin(); auto IRHS = RHSExprs.begin(); for (auto *E : ReductionOps) { if ((*IPriv)->getType()->isArrayType()) { // Emit reduction for array section. auto *LHSVar = cast(cast(*ILHS)->getDecl()); auto *RHSVar = cast(cast(*IRHS)->getDecl()); EmitOMPAggregateReduction( CGF, (*IPriv)->getType(), LHSVar, RHSVar, [=](CodeGenFunction &CGF, const Expr *, const Expr *, const Expr *) { CGF.EmitIgnoredExpr(E); }); } else // Emit reduction for array subscript or single variable. CGF.EmitIgnoredExpr(E); ++IPriv, ++ILHS, ++IRHS; } } CGF.EmitBranch(DefaultBB); // 7. Build case 2: // ... // Atomic([i] = RedOp(*[i], *[i])); // ... // break; auto *Case2BB = CGF.createBasicBlock(".omp.reduction.case2"); SwInst->addCase(CGF.Builder.getInt32(2), Case2BB); CGF.EmitBlock(Case2BB); { CodeGenFunction::RunCleanupsScope Scope(CGF); if (!WithNowait) { // Add emission of __kmpc_end_reduce(, , &); llvm::Value *EndArgs[] = { IdentTLoc, // ident_t * ThreadId, // i32 Lock // kmp_critical_name *& }; CGF.EHStack .pushCleanup::value>>( NormalAndEHCleanup, createRuntimeFunction(OMPRTL__kmpc_end_reduce), llvm::makeArrayRef(EndArgs)); } auto ILHS = LHSExprs.begin(); auto IRHS = RHSExprs.begin(); auto IPriv = Privates.begin(); for (auto *E : ReductionOps) { const Expr *XExpr = nullptr; const Expr *EExpr = nullptr; const Expr *UpExpr = nullptr; BinaryOperatorKind BO = BO_Comma; if (auto *BO = dyn_cast(E)) { if (BO->getOpcode() == BO_Assign) { XExpr = BO->getLHS(); UpExpr = BO->getRHS(); } } // Try to emit update expression as a simple atomic. auto *RHSExpr = UpExpr; if (RHSExpr) { // Analyze RHS part of the whole expression. if (auto *ACO = dyn_cast( RHSExpr->IgnoreParenImpCasts())) { // If this is a conditional operator, analyze its condition for // min/max reduction operator. RHSExpr = ACO->getCond(); } if (auto *BORHS = dyn_cast(RHSExpr->IgnoreParenImpCasts())) { EExpr = BORHS->getRHS(); BO = BORHS->getOpcode(); } } if (XExpr) { auto *VD = cast(cast(*ILHS)->getDecl()); auto &&AtomicRedGen = [this, BO, VD, IPriv, Loc](CodeGenFunction &CGF, const Expr *XExpr, const Expr *EExpr, const Expr *UpExpr) { LValue X = CGF.EmitLValue(XExpr); RValue E; if (EExpr) E = CGF.EmitAnyExpr(EExpr); CGF.EmitOMPAtomicSimpleUpdateExpr( X, E, BO, /*IsXLHSInRHSPart=*/true, llvm::Monotonic, Loc, - [&CGF, UpExpr, VD, IPriv](RValue XRValue) { + [&CGF, UpExpr, VD, IPriv, Loc](RValue XRValue) { CodeGenFunction::OMPPrivateScope PrivateScope(CGF); - PrivateScope.addPrivate(VD, [&CGF, VD, XRValue]() -> Address { - Address LHSTemp = CGF.CreateMemTemp(VD->getType()); - CGF.EmitStoreThroughLValue( - XRValue, CGF.MakeAddrLValue(LHSTemp, VD->getType())); - return LHSTemp; - }); + PrivateScope.addPrivate( + VD, [&CGF, VD, XRValue, Loc]() -> Address { + Address LHSTemp = CGF.CreateMemTemp(VD->getType()); + CGF.emitOMPSimpleStore( + CGF.MakeAddrLValue(LHSTemp, VD->getType()), XRValue, + VD->getType().getNonReferenceType(), Loc); + return LHSTemp; + }); (void)PrivateScope.Privatize(); return CGF.EmitAnyExpr(UpExpr); }); }; if ((*IPriv)->getType()->isArrayType()) { // Emit atomic reduction for array section. auto *RHSVar = cast(cast(*IRHS)->getDecl()); EmitOMPAggregateReduction(CGF, (*IPriv)->getType(), VD, RHSVar, AtomicRedGen, XExpr, EExpr, UpExpr); } else // Emit atomic reduction for array subscript or single variable. AtomicRedGen(CGF, XExpr, EExpr, UpExpr); } else { // Emit as a critical region. auto &&CritRedGen = [this, E, Loc](CodeGenFunction &CGF, const Expr *, const Expr *, const Expr *) { emitCriticalRegion( CGF, ".atomic_reduction", [E](CodeGenFunction &CGF) { CGF.EmitIgnoredExpr(E); }, Loc); }; if ((*IPriv)->getType()->isArrayType()) { auto *LHSVar = cast(cast(*ILHS)->getDecl()); auto *RHSVar = cast(cast(*IRHS)->getDecl()); EmitOMPAggregateReduction(CGF, (*IPriv)->getType(), LHSVar, RHSVar, CritRedGen); } else CritRedGen(CGF, nullptr, nullptr, nullptr); } ++ILHS, ++IRHS, ++IPriv; } } CGF.EmitBranch(DefaultBB); CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); } void CGOpenMPRuntime::emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc) { if (!CGF.HaveInsertPoint()) return; // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32 // global_tid); llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)}; // Ignore return result until untied tasks are supported. CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_taskwait), Args); } void CGOpenMPRuntime::emitInlinedDirective(CodeGenFunction &CGF, OpenMPDirectiveKind InnerKind, const RegionCodeGenTy &CodeGen, bool HasCancel) { if (!CGF.HaveInsertPoint()) return; InlinedOpenMPRegionRAII Region(CGF, CodeGen, InnerKind, HasCancel); CGF.CapturedStmtInfo->EmitBody(CGF, /*S=*/nullptr); } namespace { enum RTCancelKind { CancelNoreq = 0, CancelParallel = 1, CancelLoop = 2, CancelSections = 3, CancelTaskgroup = 4 }; } static RTCancelKind getCancellationKind(OpenMPDirectiveKind CancelRegion) { RTCancelKind CancelKind = CancelNoreq; if (CancelRegion == OMPD_parallel) CancelKind = CancelParallel; else if (CancelRegion == OMPD_for) CancelKind = CancelLoop; else if (CancelRegion == OMPD_sections) CancelKind = CancelSections; else { assert(CancelRegion == OMPD_taskgroup); CancelKind = CancelTaskgroup; } return CancelKind; } void CGOpenMPRuntime::emitCancellationPointCall( CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind CancelRegion) { if (!CGF.HaveInsertPoint()) return; // Build call kmp_int32 __kmpc_cancellationpoint(ident_t *loc, kmp_int32 // global_tid, kmp_int32 cncl_kind); if (auto *OMPRegionInfo = dyn_cast_or_null(CGF.CapturedStmtInfo)) { if (OMPRegionInfo->getDirectiveKind() == OMPD_single) return; if (OMPRegionInfo->hasCancel()) { llvm::Value *Args[] = { emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc), CGF.Builder.getInt32(getCancellationKind(CancelRegion))}; // Ignore return result until untied tasks are supported. auto *Result = CGF.EmitRuntimeCall( createRuntimeFunction(OMPRTL__kmpc_cancellationpoint), Args); // if (__kmpc_cancellationpoint()) { // __kmpc_cancel_barrier(); // exit from construct; // } auto *ExitBB = CGF.createBasicBlock(".cancel.exit"); auto *ContBB = CGF.createBasicBlock(".cancel.continue"); auto *Cmp = CGF.Builder.CreateIsNotNull(Result); CGF.Builder.CreateCondBr(Cmp, ExitBB, ContBB); CGF.EmitBlock(ExitBB); // __kmpc_cancel_barrier(); emitBarrierCall(CGF, Loc, OMPD_unknown, /*EmitChecks=*/false); // exit from construct; auto CancelDest = CGF.getOMPCancelDestination(OMPRegionInfo->getDirectiveKind()); CGF.EmitBranchThroughCleanup(CancelDest); CGF.EmitBlock(ContBB, /*IsFinished=*/true); } } } void CGOpenMPRuntime::emitCancelCall(CodeGenFunction &CGF, SourceLocation Loc, const Expr *IfCond, OpenMPDirectiveKind CancelRegion) { if (!CGF.HaveInsertPoint()) return; // Build call kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid, // kmp_int32 cncl_kind); if (auto *OMPRegionInfo = dyn_cast_or_null(CGF.CapturedStmtInfo)) { if (OMPRegionInfo->getDirectiveKind() == OMPD_single) return; auto &&ThenGen = [this, Loc, CancelRegion, OMPRegionInfo](CodeGenFunction &CGF) { llvm::Value *Args[] = { emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc), CGF.Builder.getInt32(getCancellationKind(CancelRegion))}; // Ignore return result until untied tasks are supported. auto *Result = CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_cancel), Args); // if (__kmpc_cancel()) { // __kmpc_cancel_barrier(); // exit from construct; // } auto *ExitBB = CGF.createBasicBlock(".cancel.exit"); auto *ContBB = CGF.createBasicBlock(".cancel.continue"); auto *Cmp = CGF.Builder.CreateIsNotNull(Result); CGF.Builder.CreateCondBr(Cmp, ExitBB, ContBB); CGF.EmitBlock(ExitBB); // __kmpc_cancel_barrier(); emitBarrierCall(CGF, Loc, OMPD_unknown, /*EmitChecks=*/false); // exit from construct; auto CancelDest = CGF.getOMPCancelDestination(OMPRegionInfo->getDirectiveKind()); CGF.EmitBranchThroughCleanup(CancelDest); CGF.EmitBlock(ContBB, /*IsFinished=*/true); }; if (IfCond) emitOMPIfClause(CGF, IfCond, ThenGen, [](CodeGenFunction &) {}); else ThenGen(CGF); } } /// \brief Obtain information that uniquely identifies a target entry. This /// consists of the file and device IDs as well as line and column numbers /// associated with the relevant entry source location. static void getTargetEntryUniqueInfo(ASTContext &C, SourceLocation Loc, unsigned &DeviceID, unsigned &FileID, unsigned &LineNum, unsigned &ColumnNum) { auto &SM = C.getSourceManager(); // The loc should be always valid and have a file ID (the user cannot use // #pragma directives in macros) assert(Loc.isValid() && "Source location is expected to be always valid."); assert(Loc.isFileID() && "Source location is expected to refer to a file."); PresumedLoc PLoc = SM.getPresumedLoc(Loc); assert(PLoc.isValid() && "Source location is expected to be always valid."); llvm::sys::fs::UniqueID ID; if (llvm::sys::fs::getUniqueID(PLoc.getFilename(), ID)) llvm_unreachable("Source file with target region no longer exists!"); DeviceID = ID.getDevice(); FileID = ID.getFile(); LineNum = PLoc.getLine(); ColumnNum = PLoc.getColumn(); return; } void CGOpenMPRuntime::emitTargetOutlinedFunction( const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry) { assert(!ParentName.empty() && "Invalid target region parent name!"); const CapturedStmt &CS = *cast(D.getAssociatedStmt()); // Emit target region as a standalone region. auto &&CodeGen = [&CS](CodeGenFunction &CGF) { CGF.EmitStmt(CS.getCapturedStmt()); }; // Create a unique name for the proxy/entry function that using the source // location information of the current target region. The name will be // something like: // // .omp_offloading.DD_FFFF.PP.lBB.cCC // // where DD_FFFF is an ID unique to the file (device and file IDs), PP is the // mangled name of the function that encloses the target region, BB is the // line number of the target region, and CC is the column number of the target // region. unsigned DeviceID; unsigned FileID; unsigned Line; unsigned Column; getTargetEntryUniqueInfo(CGM.getContext(), D.getLocStart(), DeviceID, FileID, Line, Column); SmallString<64> EntryFnName; { llvm::raw_svector_ostream OS(EntryFnName); OS << ".omp_offloading" << llvm::format(".%x", DeviceID) << llvm::format(".%x.", FileID) << ParentName << ".l" << Line << ".c" << Column; } CodeGenFunction CGF(CGM, true); CGOpenMPTargetRegionInfo CGInfo(CS, CodeGen, EntryFnName); CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo); OutlinedFn = CGF.GenerateOpenMPCapturedStmtFunction(CS); // If this target outline function is not an offload entry, we don't need to // register it. if (!IsOffloadEntry) return; // The target region ID is used by the runtime library to identify the current // target region, so it only has to be unique and not necessarily point to // anything. It could be the pointer to the outlined function that implements // the target region, but we aren't using that so that the compiler doesn't // need to keep that, and could therefore inline the host function if proven // worthwhile during optimization. In the other hand, if emitting code for the // device, the ID has to be the function address so that it can retrieved from // the offloading entry and launched by the runtime library. We also mark the // outlined function to have external linkage in case we are emitting code for // the device, because these functions will be entry points to the device. if (CGM.getLangOpts().OpenMPIsDevice) { OutlinedFnID = llvm::ConstantExpr::getBitCast(OutlinedFn, CGM.Int8PtrTy); OutlinedFn->setLinkage(llvm::GlobalValue::ExternalLinkage); } else OutlinedFnID = new llvm::GlobalVariable( CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, llvm::Constant::getNullValue(CGM.Int8Ty), ".omp_offload.region_id"); // Register the information for the entry associated with this target region. OffloadEntriesInfoManager.registerTargetRegionEntryInfo( DeviceID, FileID, ParentName, Line, Column, OutlinedFn, OutlinedFnID); return; } void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, llvm::Value *OutlinedFn, llvm::Value *OutlinedFnID, const Expr *IfCond, const Expr *Device, ArrayRef CapturedVars) { if (!CGF.HaveInsertPoint()) return; /// \brief Values for bit flags used to specify the mapping type for /// offloading. enum OpenMPOffloadMappingFlags { /// \brief Allocate memory on the device and move data from host to device. OMP_MAP_TO = 0x01, /// \brief Allocate memory on the device and move data from device to host. OMP_MAP_FROM = 0x02, /// \brief The element passed to the device is a pointer. OMP_MAP_PTR = 0x20, /// \brief Pass the element to the device by value. OMP_MAP_BYCOPY = 0x80, }; enum OpenMPOffloadingReservedDeviceIDs { /// \brief Device ID if the device was not defined, runtime should get it /// from environment variables in the spec. OMP_DEVICEID_UNDEF = -1, }; assert(OutlinedFn && "Invalid outlined function!"); auto &Ctx = CGF.getContext(); // Fill up the arrays with the all the captured variables. SmallVector BasePointers; SmallVector Pointers; SmallVector Sizes; SmallVector MapTypes; bool hasVLACaptures = false; const CapturedStmt &CS = *cast(D.getAssociatedStmt()); auto RI = CS.getCapturedRecordDecl()->field_begin(); // auto II = CS.capture_init_begin(); auto CV = CapturedVars.begin(); for (CapturedStmt::const_capture_iterator CI = CS.capture_begin(), CE = CS.capture_end(); CI != CE; ++CI, ++RI, ++CV) { StringRef Name; QualType Ty; llvm::Value *BasePointer; llvm::Value *Pointer; llvm::Value *Size; unsigned MapType; // VLA sizes are passed to the outlined region by copy. if (CI->capturesVariableArrayType()) { BasePointer = Pointer = *CV; Size = getTypeSize(CGF, RI->getType()); // Copy to the device as an argument. No need to retrieve it. MapType = OMP_MAP_BYCOPY; hasVLACaptures = true; } else if (CI->capturesThis()) { BasePointer = Pointer = *CV; const PointerType *PtrTy = cast(RI->getType().getTypePtr()); Size = getTypeSize(CGF, PtrTy->getPointeeType()); // Default map type. MapType = OMP_MAP_TO | OMP_MAP_FROM; } else if (CI->capturesVariableByCopy()) { MapType = OMP_MAP_BYCOPY; if (!RI->getType()->isAnyPointerType()) { // If the field is not a pointer, we need to save the actual value and // load it as a void pointer. auto DstAddr = CGF.CreateMemTemp( Ctx.getUIntPtrType(), Twine(CI->getCapturedVar()->getName()) + ".casted"); LValue DstLV = CGF.MakeAddrLValue(DstAddr, Ctx.getUIntPtrType()); auto *SrcAddrVal = CGF.EmitScalarConversion( DstAddr.getPointer(), Ctx.getPointerType(Ctx.getUIntPtrType()), Ctx.getPointerType(RI->getType()), SourceLocation()); LValue SrcLV = CGF.MakeNaturalAlignAddrLValue(SrcAddrVal, RI->getType()); // Store the value using the source type pointer. CGF.EmitStoreThroughLValue(RValue::get(*CV), SrcLV); // Load the value using the destination type pointer. BasePointer = Pointer = CGF.EmitLoadOfLValue(DstLV, SourceLocation()).getScalarVal(); } else { MapType |= OMP_MAP_PTR; BasePointer = Pointer = *CV; } Size = getTypeSize(CGF, RI->getType()); } else { assert(CI->capturesVariable() && "Expected captured reference."); BasePointer = Pointer = *CV; const ReferenceType *PtrTy = cast(RI->getType().getTypePtr()); QualType ElementType = PtrTy->getPointeeType(); Size = getTypeSize(CGF, ElementType); // The default map type for a scalar/complex type is 'to' because by // default the value doesn't have to be retrieved. For an aggregate type, // the default is 'tofrom'. MapType = ElementType->isAggregateType() ? (OMP_MAP_TO | OMP_MAP_FROM) : OMP_MAP_TO; if (ElementType->isAnyPointerType()) MapType |= OMP_MAP_PTR; } BasePointers.push_back(BasePointer); Pointers.push_back(Pointer); Sizes.push_back(Size); MapTypes.push_back(MapType); } // Keep track on whether the host function has to be executed. auto OffloadErrorQType = Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true); auto OffloadError = CGF.MakeAddrLValue( CGF.CreateMemTemp(OffloadErrorQType, ".run_host_version"), OffloadErrorQType); CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), OffloadError); // Fill up the pointer arrays and transfer execution to the device. auto &&ThenGen = [this, &Ctx, &BasePointers, &Pointers, &Sizes, &MapTypes, hasVLACaptures, Device, OutlinedFnID, OffloadError, OffloadErrorQType](CodeGenFunction &CGF) { unsigned PointerNumVal = BasePointers.size(); llvm::Value *PointerNum = CGF.Builder.getInt32(PointerNumVal); llvm::Value *BasePointersArray; llvm::Value *PointersArray; llvm::Value *SizesArray; llvm::Value *MapTypesArray; if (PointerNumVal) { llvm::APInt PointerNumAP(32, PointerNumVal, /*isSigned=*/true); QualType PointerArrayType = Ctx.getConstantArrayType( Ctx.VoidPtrTy, PointerNumAP, ArrayType::Normal, /*IndexTypeQuals=*/0); BasePointersArray = CGF.CreateMemTemp(PointerArrayType, ".offload_baseptrs").getPointer(); PointersArray = CGF.CreateMemTemp(PointerArrayType, ".offload_ptrs").getPointer(); // If we don't have any VLA types, we can use a constant array for the map // sizes, otherwise we need to fill up the arrays as we do for the // pointers. if (hasVLACaptures) { QualType SizeArrayType = Ctx.getConstantArrayType( Ctx.getSizeType(), PointerNumAP, ArrayType::Normal, /*IndexTypeQuals=*/0); SizesArray = CGF.CreateMemTemp(SizeArrayType, ".offload_sizes").getPointer(); } else { // We expect all the sizes to be constant, so we collect them to create // a constant array. SmallVector ConstSizes; for (auto S : Sizes) ConstSizes.push_back(cast(S)); auto *SizesArrayInit = llvm::ConstantArray::get( llvm::ArrayType::get(CGM.SizeTy, ConstSizes.size()), ConstSizes); auto *SizesArrayGbl = new llvm::GlobalVariable( CGM.getModule(), SizesArrayInit->getType(), /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, SizesArrayInit, ".offload_sizes"); SizesArrayGbl->setUnnamedAddr(true); SizesArray = SizesArrayGbl; } // The map types are always constant so we don't need to generate code to // fill arrays. Instead, we create an array constant. llvm::Constant *MapTypesArrayInit = llvm::ConstantDataArray::get(CGF.Builder.getContext(), MapTypes); auto *MapTypesArrayGbl = new llvm::GlobalVariable( CGM.getModule(), MapTypesArrayInit->getType(), /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapTypesArrayInit, ".offload_maptypes"); MapTypesArrayGbl->setUnnamedAddr(true); MapTypesArray = MapTypesArrayGbl; for (unsigned i = 0; i < PointerNumVal; ++i) { llvm::Value *BPVal = BasePointers[i]; if (BPVal->getType()->isPointerTy()) BPVal = CGF.Builder.CreateBitCast(BPVal, CGM.VoidPtrTy); else { assert(BPVal->getType()->isIntegerTy() && "If not a pointer, the value type must be an integer."); BPVal = CGF.Builder.CreateIntToPtr(BPVal, CGM.VoidPtrTy); } llvm::Value *BP = CGF.Builder.CreateConstInBoundsGEP2_32( llvm::ArrayType::get(CGM.VoidPtrTy, PointerNumVal), BasePointersArray, 0, i); Address BPAddr(BP, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy)); CGF.Builder.CreateStore(BPVal, BPAddr); llvm::Value *PVal = Pointers[i]; if (PVal->getType()->isPointerTy()) PVal = CGF.Builder.CreateBitCast(PVal, CGM.VoidPtrTy); else { assert(PVal->getType()->isIntegerTy() && "If not a pointer, the value type must be an integer."); PVal = CGF.Builder.CreateIntToPtr(PVal, CGM.VoidPtrTy); } llvm::Value *P = CGF.Builder.CreateConstInBoundsGEP2_32( llvm::ArrayType::get(CGM.VoidPtrTy, PointerNumVal), PointersArray, 0, i); Address PAddr(P, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy)); CGF.Builder.CreateStore(PVal, PAddr); if (hasVLACaptures) { llvm::Value *S = CGF.Builder.CreateConstInBoundsGEP2_32( llvm::ArrayType::get(CGM.SizeTy, PointerNumVal), SizesArray, /*Idx0=*/0, /*Idx1=*/i); Address SAddr(S, Ctx.getTypeAlignInChars(Ctx.getSizeType())); CGF.Builder.CreateStore(CGF.Builder.CreateIntCast( Sizes[i], CGM.SizeTy, /*isSigned=*/true), SAddr); } } BasePointersArray = CGF.Builder.CreateConstInBoundsGEP2_32( llvm::ArrayType::get(CGM.VoidPtrTy, PointerNumVal), BasePointersArray, /*Idx0=*/0, /*Idx1=*/0); PointersArray = CGF.Builder.CreateConstInBoundsGEP2_32( llvm::ArrayType::get(CGM.VoidPtrTy, PointerNumVal), PointersArray, /*Idx0=*/0, /*Idx1=*/0); SizesArray = CGF.Builder.CreateConstInBoundsGEP2_32( llvm::ArrayType::get(CGM.SizeTy, PointerNumVal), SizesArray, /*Idx0=*/0, /*Idx1=*/0); MapTypesArray = CGF.Builder.CreateConstInBoundsGEP2_32( llvm::ArrayType::get(CGM.Int32Ty, PointerNumVal), MapTypesArray, /*Idx0=*/0, /*Idx1=*/0); } else { BasePointersArray = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy); PointersArray = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy); SizesArray = llvm::ConstantPointerNull::get(CGM.SizeTy->getPointerTo()); MapTypesArray = llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()); } // On top of the arrays that were filled up, the target offloading call // takes as arguments the device id as well as the host pointer. The host // pointer is used by the runtime library to identify the current target // region, so it only has to be unique and not necessarily point to // anything. It could be the pointer to the outlined function that // implements the target region, but we aren't using that so that the // compiler doesn't need to keep that, and could therefore inline the host // function if proven worthwhile during optimization. // From this point on, we need to have an ID of the target region defined. assert(OutlinedFnID && "Invalid outlined function ID!"); // Emit device ID if any. llvm::Value *DeviceID; if (Device) DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device), CGM.Int32Ty, /*isSigned=*/true); else DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF); llvm::Value *OffloadingArgs[] = { DeviceID, OutlinedFnID, PointerNum, BasePointersArray, PointersArray, SizesArray, MapTypesArray}; auto Return = CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__tgt_target), OffloadingArgs); CGF.EmitStoreOfScalar(Return, OffloadError); }; // Notify that the host version must be executed. auto &&ElseGen = [this, OffloadError, OffloadErrorQType](CodeGenFunction &CGF) { CGF.EmitStoreOfScalar(llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/-1u), OffloadError); }; // If we have a target function ID it means that we need to support // offloading, otherwise, just execute on the host. We need to execute on host // regardless of the conditional in the if clause if, e.g., the user do not // specify target triples. if (OutlinedFnID) { if (IfCond) { emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen); } else { CodeGenFunction::RunCleanupsScope Scope(CGF); ThenGen(CGF); } } else { CodeGenFunction::RunCleanupsScope Scope(CGF); ElseGen(CGF); } // Check the error code and execute the host version if required. auto OffloadFailedBlock = CGF.createBasicBlock("omp_offload.failed"); auto OffloadContBlock = CGF.createBasicBlock("omp_offload.cont"); auto OffloadErrorVal = CGF.EmitLoadOfScalar(OffloadError, SourceLocation()); auto Failed = CGF.Builder.CreateIsNotNull(OffloadErrorVal); CGF.Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock); CGF.EmitBlock(OffloadFailedBlock); CGF.Builder.CreateCall(OutlinedFn, BasePointers); CGF.EmitBranch(OffloadContBlock); CGF.EmitBlock(OffloadContBlock, /*IsFinished=*/true); return; } void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S, StringRef ParentName) { if (!S) return; // If we find a OMP target directive, codegen the outline function and // register the result. // FIXME: Add other directives with target when they become supported. bool isTargetDirective = isa(S); if (isTargetDirective) { auto *E = cast(S); unsigned DeviceID; unsigned FileID; unsigned Line; unsigned Column; getTargetEntryUniqueInfo(CGM.getContext(), E->getLocStart(), DeviceID, FileID, Line, Column); // Is this a target region that should not be emitted as an entry point? If // so just signal we are done with this target region. if (!OffloadEntriesInfoManager.hasTargetRegionEntryInfo( DeviceID, FileID, ParentName, Line, Column)) return; llvm::Function *Fn; llvm::Constant *Addr; emitTargetOutlinedFunction(*E, ParentName, Fn, Addr, /*isOffloadEntry=*/true); assert(Fn && Addr && "Target region emission failed."); return; } if (const OMPExecutableDirective *E = dyn_cast(S)) { if (!E->getAssociatedStmt()) return; scanForTargetRegionsFunctions( cast(E->getAssociatedStmt())->getCapturedStmt(), ParentName); return; } // If this is a lambda function, look into its body. if (auto *L = dyn_cast(S)) S = L->getBody(); // Keep looking for target regions recursively. for (auto *II : S->children()) scanForTargetRegionsFunctions(II, ParentName); return; } bool CGOpenMPRuntime::emitTargetFunctions(GlobalDecl GD) { auto &FD = *cast(GD.getDecl()); // If emitting code for the host, we do not process FD here. Instead we do // the normal code generation. if (!CGM.getLangOpts().OpenMPIsDevice) return false; // Try to detect target regions in the function. scanForTargetRegionsFunctions(FD.getBody(), CGM.getMangledName(GD)); // We should not emit any function othen that the ones created during the // scanning. Therefore, we signal that this function is completely dealt // with. return true; } bool CGOpenMPRuntime::emitTargetGlobalVariable(GlobalDecl GD) { if (!CGM.getLangOpts().OpenMPIsDevice) return false; // Check if there are Ctors/Dtors in this declaration and look for target // regions in it. We use the complete variant to produce the kernel name // mangling. QualType RDTy = cast(GD.getDecl())->getType(); if (auto *RD = RDTy->getBaseElementTypeUnsafe()->getAsCXXRecordDecl()) { for (auto *Ctor : RD->ctors()) { StringRef ParentName = CGM.getMangledName(GlobalDecl(Ctor, Ctor_Complete)); scanForTargetRegionsFunctions(Ctor->getBody(), ParentName); } auto *Dtor = RD->getDestructor(); if (Dtor) { StringRef ParentName = CGM.getMangledName(GlobalDecl(Dtor, Dtor_Complete)); scanForTargetRegionsFunctions(Dtor->getBody(), ParentName); } } // If we are in target mode we do not emit any global (declare target is not // implemented yet). Therefore we signal that GD was processed in this case. return true; } bool CGOpenMPRuntime::emitTargetGlobal(GlobalDecl GD) { auto *VD = GD.getDecl(); if (isa(VD)) return emitTargetFunctions(GD); return emitTargetGlobalVariable(GD); } llvm::Function *CGOpenMPRuntime::emitRegistrationFunction() { // If we have offloading in the current module, we need to emit the entries // now and register the offloading descriptor. createOffloadEntriesAndInfoMetadata(); // Create and register the offloading binary descriptors. This is the main // entity that captures all the information about offloading in the current // compilation unit. return createOffloadingBinaryDescriptorRegistration(); } Index: projects/clang380-import/contrib/llvm/tools/clang/lib/CodeGen/CGStmtOpenMP.cpp =================================================================== --- projects/clang380-import/contrib/llvm/tools/clang/lib/CodeGen/CGStmtOpenMP.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/tools/clang/lib/CodeGen/CGStmtOpenMP.cpp (revision 294609) @@ -1,2679 +1,2679 @@ //===--- CGStmtOpenMP.cpp - Emit LLVM Code from Statements ----------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This contains code to emit OpenMP nodes as LLVM code. // //===----------------------------------------------------------------------===// #include "CGOpenMPRuntime.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "TargetInfo.h" #include "clang/AST/Stmt.h" #include "clang/AST/StmtOpenMP.h" using namespace clang; using namespace CodeGen; void CodeGenFunction::GenerateOpenMPCapturedVars( const CapturedStmt &S, SmallVectorImpl &CapturedVars) { const RecordDecl *RD = S.getCapturedRecordDecl(); auto CurField = RD->field_begin(); auto CurCap = S.captures().begin(); for (CapturedStmt::const_capture_init_iterator I = S.capture_init_begin(), E = S.capture_init_end(); I != E; ++I, ++CurField, ++CurCap) { if (CurField->hasCapturedVLAType()) { auto VAT = CurField->getCapturedVLAType(); auto *Val = VLASizeMap[VAT->getSizeExpr()]; CapturedVars.push_back(Val); } else if (CurCap->capturesThis()) CapturedVars.push_back(CXXThisValue); else if (CurCap->capturesVariableByCopy()) CapturedVars.push_back( EmitLoadOfLValue(EmitLValue(*I), SourceLocation()).getScalarVal()); else { assert(CurCap->capturesVariable() && "Expected capture by reference."); CapturedVars.push_back(EmitLValue(*I).getAddress().getPointer()); } } } static Address castValueFromUintptr(CodeGenFunction &CGF, QualType DstType, StringRef Name, LValue AddrLV, bool isReferenceType = false) { ASTContext &Ctx = CGF.getContext(); auto *CastedPtr = CGF.EmitScalarConversion( AddrLV.getAddress().getPointer(), Ctx.getUIntPtrType(), Ctx.getPointerType(DstType), SourceLocation()); auto TmpAddr = CGF.MakeNaturalAlignAddrLValue(CastedPtr, Ctx.getPointerType(DstType)) .getAddress(); // If we are dealing with references we need to return the address of the // reference instead of the reference of the value. if (isReferenceType) { QualType RefType = Ctx.getLValueReferenceType(DstType); auto *RefVal = TmpAddr.getPointer(); TmpAddr = CGF.CreateMemTemp(RefType, Twine(Name) + ".ref"); auto TmpLVal = CGF.MakeAddrLValue(TmpAddr, RefType); CGF.EmitScalarInit(RefVal, TmpLVal); } return TmpAddr; } llvm::Function * CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S) { assert( CapturedStmtInfo && "CapturedStmtInfo should be set when generating the captured function"); const CapturedDecl *CD = S.getCapturedDecl(); const RecordDecl *RD = S.getCapturedRecordDecl(); assert(CD->hasBody() && "missing CapturedDecl body"); // Build the argument list. ASTContext &Ctx = CGM.getContext(); FunctionArgList Args; Args.append(CD->param_begin(), std::next(CD->param_begin(), CD->getContextParamPosition())); auto I = S.captures().begin(); for (auto *FD : RD->fields()) { QualType ArgType = FD->getType(); IdentifierInfo *II = nullptr; VarDecl *CapVar = nullptr; // If this is a capture by copy and the type is not a pointer, the outlined // function argument type should be uintptr and the value properly casted to // uintptr. This is necessary given that the runtime library is only able to // deal with pointers. We can pass in the same way the VLA type sizes to the // outlined function. if ((I->capturesVariableByCopy() && !ArgType->isAnyPointerType()) || I->capturesVariableArrayType()) ArgType = Ctx.getUIntPtrType(); if (I->capturesVariable() || I->capturesVariableByCopy()) { CapVar = I->getCapturedVar(); II = CapVar->getIdentifier(); } else if (I->capturesThis()) II = &getContext().Idents.get("this"); else { assert(I->capturesVariableArrayType()); II = &getContext().Idents.get("vla"); } if (ArgType->isVariablyModifiedType()) ArgType = getContext().getVariableArrayDecayedType(ArgType); Args.push_back(ImplicitParamDecl::Create(getContext(), nullptr, FD->getLocation(), II, ArgType)); ++I; } Args.append( std::next(CD->param_begin(), CD->getContextParamPosition() + 1), CD->param_end()); // Create the function declaration. FunctionType::ExtInfo ExtInfo; const CGFunctionInfo &FuncInfo = CGM.getTypes().arrangeFreeFunctionDeclaration(Ctx.VoidTy, Args, ExtInfo, /*IsVariadic=*/false); llvm::FunctionType *FuncLLVMTy = CGM.getTypes().GetFunctionType(FuncInfo); llvm::Function *F = llvm::Function::Create( FuncLLVMTy, llvm::GlobalValue::InternalLinkage, CapturedStmtInfo->getHelperName(), &CGM.getModule()); CGM.SetInternalFunctionAttributes(CD, F, FuncInfo); if (CD->isNothrow()) F->addFnAttr(llvm::Attribute::NoUnwind); // Generate the function. StartFunction(CD, Ctx.VoidTy, F, FuncInfo, Args, CD->getLocation(), CD->getBody()->getLocStart()); unsigned Cnt = CD->getContextParamPosition(); I = S.captures().begin(); for (auto *FD : RD->fields()) { // If we are capturing a pointer by copy we don't need to do anything, just // use the value that we get from the arguments. if (I->capturesVariableByCopy() && FD->getType()->isAnyPointerType()) { setAddrOfLocalVar(I->getCapturedVar(), GetAddrOfLocalVar(Args[Cnt])); ++Cnt, ++I; continue; } LValue ArgLVal = MakeAddrLValue(GetAddrOfLocalVar(Args[Cnt]), Args[Cnt]->getType(), AlignmentSource::Decl); if (FD->hasCapturedVLAType()) { LValue CastedArgLVal = MakeAddrLValue(castValueFromUintptr(*this, FD->getType(), Args[Cnt]->getName(), ArgLVal), FD->getType(), AlignmentSource::Decl); auto *ExprArg = EmitLoadOfLValue(CastedArgLVal, SourceLocation()).getScalarVal(); auto VAT = FD->getCapturedVLAType(); VLASizeMap[VAT->getSizeExpr()] = ExprArg; } else if (I->capturesVariable()) { auto *Var = I->getCapturedVar(); QualType VarTy = Var->getType(); Address ArgAddr = ArgLVal.getAddress(); if (!VarTy->isReferenceType()) { ArgAddr = EmitLoadOfReference( ArgAddr, ArgLVal.getType()->castAs()); } setAddrOfLocalVar( Var, Address(ArgAddr.getPointer(), getContext().getDeclAlign(Var))); } else if (I->capturesVariableByCopy()) { assert(!FD->getType()->isAnyPointerType() && "Not expecting a captured pointer."); auto *Var = I->getCapturedVar(); QualType VarTy = Var->getType(); setAddrOfLocalVar(I->getCapturedVar(), castValueFromUintptr(*this, FD->getType(), Args[Cnt]->getName(), ArgLVal, VarTy->isReferenceType())); } else { // If 'this' is captured, load it into CXXThisValue. assert(I->capturesThis()); CXXThisValue = EmitLoadOfLValue(ArgLVal, Args[Cnt]->getLocation()).getScalarVal(); } ++Cnt, ++I; } PGO.assignRegionCounters(GlobalDecl(CD), F); CapturedStmtInfo->EmitBody(*this, CD->getBody()); FinishFunction(CD->getBodyRBrace()); return F; } //===----------------------------------------------------------------------===// // OpenMP Directive Emission //===----------------------------------------------------------------------===// void CodeGenFunction::EmitOMPAggregateAssign( Address DestAddr, Address SrcAddr, QualType OriginalType, const llvm::function_ref &CopyGen) { // Perform element-by-element initialization. QualType ElementTy; // Drill down to the base element type on both arrays. auto ArrayTy = OriginalType->getAsArrayTypeUnsafe(); auto NumElements = emitArrayLength(ArrayTy, ElementTy, DestAddr); SrcAddr = Builder.CreateElementBitCast(SrcAddr, DestAddr.getElementType()); auto SrcBegin = SrcAddr.getPointer(); auto DestBegin = DestAddr.getPointer(); // Cast from pointer to array type to pointer to single element. auto DestEnd = Builder.CreateGEP(DestBegin, NumElements); // The basic structure here is a while-do loop. auto BodyBB = createBasicBlock("omp.arraycpy.body"); auto DoneBB = createBasicBlock("omp.arraycpy.done"); auto IsEmpty = Builder.CreateICmpEQ(DestBegin, DestEnd, "omp.arraycpy.isempty"); Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB); // Enter the loop body, making that address the current address. auto EntryBB = Builder.GetInsertBlock(); EmitBlock(BodyBB); CharUnits ElementSize = getContext().getTypeSizeInChars(ElementTy); llvm::PHINode *SrcElementPHI = Builder.CreatePHI(SrcBegin->getType(), 2, "omp.arraycpy.srcElementPast"); SrcElementPHI->addIncoming(SrcBegin, EntryBB); Address SrcElementCurrent = Address(SrcElementPHI, SrcAddr.getAlignment().alignmentOfArrayElement(ElementSize)); llvm::PHINode *DestElementPHI = Builder.CreatePHI(DestBegin->getType(), 2, "omp.arraycpy.destElementPast"); DestElementPHI->addIncoming(DestBegin, EntryBB); Address DestElementCurrent = Address(DestElementPHI, DestAddr.getAlignment().alignmentOfArrayElement(ElementSize)); // Emit copy. CopyGen(DestElementCurrent, SrcElementCurrent); // Shift the address forward by one element. auto DestElementNext = Builder.CreateConstGEP1_32( DestElementPHI, /*Idx0=*/1, "omp.arraycpy.dest.element"); auto SrcElementNext = Builder.CreateConstGEP1_32( SrcElementPHI, /*Idx0=*/1, "omp.arraycpy.src.element"); // Check whether we've reached the end. auto Done = Builder.CreateICmpEQ(DestElementNext, DestEnd, "omp.arraycpy.done"); Builder.CreateCondBr(Done, DoneBB, BodyBB); DestElementPHI->addIncoming(DestElementNext, Builder.GetInsertBlock()); SrcElementPHI->addIncoming(SrcElementNext, Builder.GetInsertBlock()); // Done. EmitBlock(DoneBB, /*IsFinished=*/true); } /// \brief Emit initialization of arrays of complex types. /// \param DestAddr Address of the array. /// \param Type Type of array. /// \param Init Initial expression of array. static void EmitOMPAggregateInit(CodeGenFunction &CGF, Address DestAddr, QualType Type, const Expr *Init) { // Perform element-by-element initialization. QualType ElementTy; // Drill down to the base element type on both arrays. auto ArrayTy = Type->getAsArrayTypeUnsafe(); auto NumElements = CGF.emitArrayLength(ArrayTy, ElementTy, DestAddr); DestAddr = CGF.Builder.CreateElementBitCast(DestAddr, DestAddr.getElementType()); auto DestBegin = DestAddr.getPointer(); // Cast from pointer to array type to pointer to single element. auto DestEnd = CGF.Builder.CreateGEP(DestBegin, NumElements); // The basic structure here is a while-do loop. auto BodyBB = CGF.createBasicBlock("omp.arrayinit.body"); auto DoneBB = CGF.createBasicBlock("omp.arrayinit.done"); auto IsEmpty = CGF.Builder.CreateICmpEQ(DestBegin, DestEnd, "omp.arrayinit.isempty"); CGF.Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB); // Enter the loop body, making that address the current address. auto EntryBB = CGF.Builder.GetInsertBlock(); CGF.EmitBlock(BodyBB); CharUnits ElementSize = CGF.getContext().getTypeSizeInChars(ElementTy); llvm::PHINode *DestElementPHI = CGF.Builder.CreatePHI( DestBegin->getType(), 2, "omp.arraycpy.destElementPast"); DestElementPHI->addIncoming(DestBegin, EntryBB); Address DestElementCurrent = Address(DestElementPHI, DestAddr.getAlignment().alignmentOfArrayElement(ElementSize)); // Emit copy. { CodeGenFunction::RunCleanupsScope InitScope(CGF); CGF.EmitAnyExprToMem(Init, DestElementCurrent, ElementTy.getQualifiers(), /*IsInitializer=*/false); } // Shift the address forward by one element. auto DestElementNext = CGF.Builder.CreateConstGEP1_32( DestElementPHI, /*Idx0=*/1, "omp.arraycpy.dest.element"); // Check whether we've reached the end. auto Done = CGF.Builder.CreateICmpEQ(DestElementNext, DestEnd, "omp.arraycpy.done"); CGF.Builder.CreateCondBr(Done, DoneBB, BodyBB); DestElementPHI->addIncoming(DestElementNext, CGF.Builder.GetInsertBlock()); // Done. CGF.EmitBlock(DoneBB, /*IsFinished=*/true); } void CodeGenFunction::EmitOMPCopy(QualType OriginalType, Address DestAddr, Address SrcAddr, const VarDecl *DestVD, const VarDecl *SrcVD, const Expr *Copy) { if (OriginalType->isArrayType()) { auto *BO = dyn_cast(Copy); if (BO && BO->getOpcode() == BO_Assign) { // Perform simple memcpy for simple copying. EmitAggregateAssign(DestAddr, SrcAddr, OriginalType); } else { // For arrays with complex element types perform element by element // copying. EmitOMPAggregateAssign( DestAddr, SrcAddr, OriginalType, [this, Copy, SrcVD, DestVD](Address DestElement, Address SrcElement) { // Working with the single array element, so have to remap // destination and source variables to corresponding array // elements. CodeGenFunction::OMPPrivateScope Remap(*this); Remap.addPrivate(DestVD, [DestElement]() -> Address { return DestElement; }); Remap.addPrivate( SrcVD, [SrcElement]() -> Address { return SrcElement; }); (void)Remap.Privatize(); EmitIgnoredExpr(Copy); }); } } else { // Remap pseudo source variable to private copy. CodeGenFunction::OMPPrivateScope Remap(*this); Remap.addPrivate(SrcVD, [SrcAddr]() -> Address { return SrcAddr; }); Remap.addPrivate(DestVD, [DestAddr]() -> Address { return DestAddr; }); (void)Remap.Privatize(); // Emit copying of the whole variable. EmitIgnoredExpr(Copy); } } bool CodeGenFunction::EmitOMPFirstprivateClause(const OMPExecutableDirective &D, OMPPrivateScope &PrivateScope) { if (!HaveInsertPoint()) return false; llvm::DenseSet EmittedAsFirstprivate; for (const auto *C : D.getClausesOfKind()) { auto IRef = C->varlist_begin(); auto InitsRef = C->inits().begin(); for (auto IInit : C->private_copies()) { auto *OrigVD = cast(cast(*IRef)->getDecl()); if (EmittedAsFirstprivate.count(OrigVD) == 0) { EmittedAsFirstprivate.insert(OrigVD); auto *VD = cast(cast(IInit)->getDecl()); auto *VDInit = cast(cast(*InitsRef)->getDecl()); bool IsRegistered; DeclRefExpr DRE( const_cast(OrigVD), /*RefersToEnclosingVariableOrCapture=*/CapturedStmtInfo->lookup( OrigVD) != nullptr, (*IRef)->getType(), VK_LValue, (*IRef)->getExprLoc()); Address OriginalAddr = EmitLValue(&DRE).getAddress(); QualType Type = OrigVD->getType(); if (Type->isArrayType()) { // Emit VarDecl with copy init for arrays. // Get the address of the original variable captured in current // captured region. IsRegistered = PrivateScope.addPrivate(OrigVD, [&]() -> Address { auto Emission = EmitAutoVarAlloca(*VD); auto *Init = VD->getInit(); if (!isa(Init) || isTrivialInitializer(Init)) { // Perform simple memcpy. EmitAggregateAssign(Emission.getAllocatedAddress(), OriginalAddr, Type); } else { EmitOMPAggregateAssign( Emission.getAllocatedAddress(), OriginalAddr, Type, [this, VDInit, Init](Address DestElement, Address SrcElement) { // Clean up any temporaries needed by the initialization. RunCleanupsScope InitScope(*this); // Emit initialization for single element. setAddrOfLocalVar(VDInit, SrcElement); EmitAnyExprToMem(Init, DestElement, Init->getType().getQualifiers(), /*IsInitializer*/ false); LocalDeclMap.erase(VDInit); }); } EmitAutoVarCleanups(Emission); return Emission.getAllocatedAddress(); }); } else { IsRegistered = PrivateScope.addPrivate(OrigVD, [&]() -> Address { // Emit private VarDecl with copy init. // Remap temp VDInit variable to the address of the original // variable // (for proper handling of captured global variables). setAddrOfLocalVar(VDInit, OriginalAddr); EmitDecl(*VD); LocalDeclMap.erase(VDInit); return GetAddrOfLocalVar(VD); }); } assert(IsRegistered && "firstprivate var already registered as private"); // Silence the warning about unused variable. (void)IsRegistered; } ++IRef, ++InitsRef; } } return !EmittedAsFirstprivate.empty(); } void CodeGenFunction::EmitOMPPrivateClause( const OMPExecutableDirective &D, CodeGenFunction::OMPPrivateScope &PrivateScope) { if (!HaveInsertPoint()) return; llvm::DenseSet EmittedAsPrivate; for (const auto *C : D.getClausesOfKind()) { auto IRef = C->varlist_begin(); for (auto IInit : C->private_copies()) { auto *OrigVD = cast(cast(*IRef)->getDecl()); if (EmittedAsPrivate.insert(OrigVD->getCanonicalDecl()).second) { auto VD = cast(cast(IInit)->getDecl()); bool IsRegistered = PrivateScope.addPrivate(OrigVD, [&]() -> Address { // Emit private VarDecl with copy init. EmitDecl(*VD); return GetAddrOfLocalVar(VD); }); assert(IsRegistered && "private var already registered as private"); // Silence the warning about unused variable. (void)IsRegistered; } ++IRef; } } } bool CodeGenFunction::EmitOMPCopyinClause(const OMPExecutableDirective &D) { if (!HaveInsertPoint()) return false; // threadprivate_var1 = master_threadprivate_var1; // operator=(threadprivate_var2, master_threadprivate_var2); // ... // __kmpc_barrier(&loc, global_tid); llvm::DenseSet CopiedVars; llvm::BasicBlock *CopyBegin = nullptr, *CopyEnd = nullptr; for (const auto *C : D.getClausesOfKind()) { auto IRef = C->varlist_begin(); auto ISrcRef = C->source_exprs().begin(); auto IDestRef = C->destination_exprs().begin(); for (auto *AssignOp : C->assignment_ops()) { auto *VD = cast(cast(*IRef)->getDecl()); QualType Type = VD->getType(); if (CopiedVars.insert(VD->getCanonicalDecl()).second) { // Get the address of the master variable. If we are emitting code with // TLS support, the address is passed from the master as field in the // captured declaration. Address MasterAddr = Address::invalid(); if (getLangOpts().OpenMPUseTLS && getContext().getTargetInfo().isTLSSupported()) { assert(CapturedStmtInfo->lookup(VD) && "Copyin threadprivates should have been captured!"); DeclRefExpr DRE(const_cast(VD), true, (*IRef)->getType(), VK_LValue, (*IRef)->getExprLoc()); MasterAddr = EmitLValue(&DRE).getAddress(); LocalDeclMap.erase(VD); } else { MasterAddr = Address(VD->isStaticLocal() ? CGM.getStaticLocalDeclAddress(VD) : CGM.GetAddrOfGlobal(VD), getContext().getDeclAlign(VD)); } // Get the address of the threadprivate variable. Address PrivateAddr = EmitLValue(*IRef).getAddress(); if (CopiedVars.size() == 1) { // At first check if current thread is a master thread. If it is, no // need to copy data. CopyBegin = createBasicBlock("copyin.not.master"); CopyEnd = createBasicBlock("copyin.not.master.end"); Builder.CreateCondBr( Builder.CreateICmpNE( Builder.CreatePtrToInt(MasterAddr.getPointer(), CGM.IntPtrTy), Builder.CreatePtrToInt(PrivateAddr.getPointer(), CGM.IntPtrTy)), CopyBegin, CopyEnd); EmitBlock(CopyBegin); } auto *SrcVD = cast(cast(*ISrcRef)->getDecl()); auto *DestVD = cast(cast(*IDestRef)->getDecl()); EmitOMPCopy(Type, PrivateAddr, MasterAddr, DestVD, SrcVD, AssignOp); } ++IRef; ++ISrcRef; ++IDestRef; } } if (CopyEnd) { // Exit out of copying procedure for non-master thread. EmitBlock(CopyEnd, /*IsFinished=*/true); return true; } return false; } bool CodeGenFunction::EmitOMPLastprivateClauseInit( const OMPExecutableDirective &D, OMPPrivateScope &PrivateScope) { if (!HaveInsertPoint()) return false; bool HasAtLeastOneLastprivate = false; llvm::DenseSet AlreadyEmittedVars; for (const auto *C : D.getClausesOfKind()) { HasAtLeastOneLastprivate = true; auto IRef = C->varlist_begin(); auto IDestRef = C->destination_exprs().begin(); for (auto *IInit : C->private_copies()) { // Keep the address of the original variable for future update at the end // of the loop. auto *OrigVD = cast(cast(*IRef)->getDecl()); if (AlreadyEmittedVars.insert(OrigVD->getCanonicalDecl()).second) { auto *DestVD = cast(cast(*IDestRef)->getDecl()); PrivateScope.addPrivate(DestVD, [this, OrigVD, IRef]() -> Address { DeclRefExpr DRE( const_cast(OrigVD), /*RefersToEnclosingVariableOrCapture=*/CapturedStmtInfo->lookup( OrigVD) != nullptr, (*IRef)->getType(), VK_LValue, (*IRef)->getExprLoc()); return EmitLValue(&DRE).getAddress(); }); // Check if the variable is also a firstprivate: in this case IInit is // not generated. Initialization of this variable will happen in codegen // for 'firstprivate' clause. if (IInit) { auto *VD = cast(cast(IInit)->getDecl()); bool IsRegistered = PrivateScope.addPrivate(OrigVD, [&]() -> Address { // Emit private VarDecl with copy init. EmitDecl(*VD); return GetAddrOfLocalVar(VD); }); assert(IsRegistered && "lastprivate var already registered as private"); (void)IsRegistered; } } ++IRef, ++IDestRef; } } return HasAtLeastOneLastprivate; } void CodeGenFunction::EmitOMPLastprivateClauseFinal( const OMPExecutableDirective &D, llvm::Value *IsLastIterCond) { if (!HaveInsertPoint()) return; // Emit following code: // if () { // orig_var1 = private_orig_var1; // ... // orig_varn = private_orig_varn; // } llvm::BasicBlock *ThenBB = nullptr; llvm::BasicBlock *DoneBB = nullptr; if (IsLastIterCond) { ThenBB = createBasicBlock(".omp.lastprivate.then"); DoneBB = createBasicBlock(".omp.lastprivate.done"); Builder.CreateCondBr(IsLastIterCond, ThenBB, DoneBB); EmitBlock(ThenBB); } llvm::DenseMap LoopCountersAndUpdates; const Expr *LastIterVal = nullptr; const Expr *IVExpr = nullptr; const Expr *IncExpr = nullptr; if (auto *LoopDirective = dyn_cast(&D)) { if (isOpenMPWorksharingDirective(D.getDirectiveKind())) { LastIterVal = cast(cast( LoopDirective->getUpperBoundVariable()) ->getDecl()) ->getAnyInitializer(); IVExpr = LoopDirective->getIterationVariable(); IncExpr = LoopDirective->getInc(); auto IUpdate = LoopDirective->updates().begin(); for (auto *E : LoopDirective->counters()) { auto *D = cast(E)->getDecl()->getCanonicalDecl(); LoopCountersAndUpdates[D] = *IUpdate; ++IUpdate; } } } { llvm::DenseSet AlreadyEmittedVars; bool FirstLCV = true; for (const auto *C : D.getClausesOfKind()) { auto IRef = C->varlist_begin(); auto ISrcRef = C->source_exprs().begin(); auto IDestRef = C->destination_exprs().begin(); for (auto *AssignOp : C->assignment_ops()) { auto *PrivateVD = cast(cast(*IRef)->getDecl()); QualType Type = PrivateVD->getType(); auto *CanonicalVD = PrivateVD->getCanonicalDecl(); if (AlreadyEmittedVars.insert(CanonicalVD).second) { // If lastprivate variable is a loop control variable for loop-based // directive, update its value before copyin back to original // variable. if (auto *UpExpr = LoopCountersAndUpdates.lookup(CanonicalVD)) { if (FirstLCV && LastIterVal) { EmitAnyExprToMem(LastIterVal, EmitLValue(IVExpr).getAddress(), IVExpr->getType().getQualifiers(), /*IsInitializer=*/false); EmitIgnoredExpr(IncExpr); FirstLCV = false; } EmitIgnoredExpr(UpExpr); } auto *SrcVD = cast(cast(*ISrcRef)->getDecl()); auto *DestVD = cast(cast(*IDestRef)->getDecl()); // Get the address of the original variable. Address OriginalAddr = GetAddrOfLocalVar(DestVD); // Get the address of the private variable. Address PrivateAddr = GetAddrOfLocalVar(PrivateVD); if (auto RefTy = PrivateVD->getType()->getAs()) PrivateAddr = Address(Builder.CreateLoad(PrivateAddr), getNaturalTypeAlignment(RefTy->getPointeeType())); EmitOMPCopy(Type, OriginalAddr, PrivateAddr, DestVD, SrcVD, AssignOp); } ++IRef; ++ISrcRef; ++IDestRef; } } } if (IsLastIterCond) { EmitBlock(DoneBB, /*IsFinished=*/true); } } void CodeGenFunction::EmitOMPReductionClauseInit( const OMPExecutableDirective &D, CodeGenFunction::OMPPrivateScope &PrivateScope) { if (!HaveInsertPoint()) return; for (const auto *C : D.getClausesOfKind()) { auto ILHS = C->lhs_exprs().begin(); auto IRHS = C->rhs_exprs().begin(); auto IPriv = C->privates().begin(); for (auto IRef : C->varlists()) { auto *LHSVD = cast(cast(*ILHS)->getDecl()); auto *RHSVD = cast(cast(*IRHS)->getDecl()); auto *PrivateVD = cast(cast(*IPriv)->getDecl()); if (auto *OASE = dyn_cast(IRef)) { auto *Base = OASE->getBase()->IgnoreParenImpCasts(); while (auto *TempOASE = dyn_cast(Base)) Base = TempOASE->getBase()->IgnoreParenImpCasts(); while (auto *TempASE = dyn_cast(Base)) Base = TempASE->getBase()->IgnoreParenImpCasts(); auto *DE = cast(Base); auto *OrigVD = cast(DE->getDecl()); auto OASELValueLB = EmitOMPArraySectionExpr(OASE); auto OASELValueUB = EmitOMPArraySectionExpr(OASE, /*IsLowerBound=*/false); auto OriginalBaseLValue = EmitLValue(DE); auto BaseLValue = OriginalBaseLValue; auto *Zero = Builder.getInt64(/*C=*/0); llvm::SmallVector Indexes; Indexes.push_back(Zero); auto *ItemTy = OASELValueLB.getPointer()->getType()->getPointerElementType(); auto *Ty = BaseLValue.getPointer()->getType()->getPointerElementType(); while (Ty != ItemTy) { Indexes.push_back(Zero); Ty = Ty->getPointerElementType(); } BaseLValue = MakeAddrLValue( Address(Builder.CreateInBoundsGEP(BaseLValue.getPointer(), Indexes), OASELValueLB.getAlignment()), OASELValueLB.getType(), OASELValueLB.getAlignmentSource()); // Store the address of the original variable associated with the LHS // implicit variable. PrivateScope.addPrivate(LHSVD, [this, OASELValueLB]() -> Address { return OASELValueLB.getAddress(); }); // Emit reduction copy. bool IsRegistered = PrivateScope.addPrivate( OrigVD, [this, PrivateVD, BaseLValue, OASELValueLB, OASELValueUB, OriginalBaseLValue]() -> Address { // Emit VarDecl with copy init for arrays. // Get the address of the original variable captured in current // captured region. auto *Size = Builder.CreatePtrDiff(OASELValueUB.getPointer(), OASELValueLB.getPointer()); Size = Builder.CreateNUWAdd( Size, llvm::ConstantInt::get(Size->getType(), /*V=*/1)); CodeGenFunction::OpaqueValueMapping OpaqueMap( *this, cast( getContext() .getAsVariableArrayType(PrivateVD->getType()) ->getSizeExpr()), RValue::get(Size)); EmitVariablyModifiedType(PrivateVD->getType()); auto Emission = EmitAutoVarAlloca(*PrivateVD); auto Addr = Emission.getAllocatedAddress(); auto *Init = PrivateVD->getInit(); EmitOMPAggregateInit(*this, Addr, PrivateVD->getType(), Init); EmitAutoVarCleanups(Emission); // Emit private VarDecl with reduction init. auto *Offset = Builder.CreatePtrDiff(BaseLValue.getPointer(), OASELValueLB.getPointer()); auto *Ptr = Builder.CreateGEP(Addr.getPointer(), Offset); Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast( Ptr, OriginalBaseLValue.getPointer()->getType()); return Address(Ptr, OriginalBaseLValue.getAlignment()); }); assert(IsRegistered && "private var already registered as private"); // Silence the warning about unused variable. (void)IsRegistered; PrivateScope.addPrivate(RHSVD, [this, PrivateVD]() -> Address { return GetAddrOfLocalVar(PrivateVD); }); } else if (auto *ASE = dyn_cast(IRef)) { auto *Base = ASE->getBase()->IgnoreParenImpCasts(); while (auto *TempASE = dyn_cast(Base)) Base = TempASE->getBase()->IgnoreParenImpCasts(); auto *DE = cast(Base); auto *OrigVD = cast(DE->getDecl()); auto ASELValue = EmitLValue(ASE); auto OriginalBaseLValue = EmitLValue(DE); auto BaseLValue = OriginalBaseLValue; auto *Zero = Builder.getInt64(/*C=*/0); llvm::SmallVector Indexes; Indexes.push_back(Zero); auto *ItemTy = ASELValue.getPointer()->getType()->getPointerElementType(); auto *Ty = BaseLValue.getPointer()->getType()->getPointerElementType(); while (Ty != ItemTy) { Indexes.push_back(Zero); Ty = Ty->getPointerElementType(); } BaseLValue = MakeAddrLValue( Address(Builder.CreateInBoundsGEP(BaseLValue.getPointer(), Indexes), ASELValue.getAlignment()), ASELValue.getType(), ASELValue.getAlignmentSource()); // Store the address of the original variable associated with the LHS // implicit variable. PrivateScope.addPrivate(LHSVD, [this, ASELValue]() -> Address { return ASELValue.getAddress(); }); // Emit reduction copy. bool IsRegistered = PrivateScope.addPrivate( OrigVD, [this, PrivateVD, BaseLValue, ASELValue, OriginalBaseLValue]() -> Address { // Emit private VarDecl with reduction init. EmitDecl(*PrivateVD); auto Addr = GetAddrOfLocalVar(PrivateVD); auto *Offset = Builder.CreatePtrDiff(BaseLValue.getPointer(), ASELValue.getPointer()); auto *Ptr = Builder.CreateGEP(Addr.getPointer(), Offset); Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast( Ptr, OriginalBaseLValue.getPointer()->getType()); return Address(Ptr, OriginalBaseLValue.getAlignment()); }); assert(IsRegistered && "private var already registered as private"); // Silence the warning about unused variable. (void)IsRegistered; PrivateScope.addPrivate(RHSVD, [this, PrivateVD]() -> Address { return GetAddrOfLocalVar(PrivateVD); }); } else { auto *OrigVD = cast(cast(IRef)->getDecl()); // Store the address of the original variable associated with the LHS // implicit variable. PrivateScope.addPrivate(LHSVD, [this, OrigVD, IRef]() -> Address { DeclRefExpr DRE(const_cast(OrigVD), CapturedStmtInfo->lookup(OrigVD) != nullptr, IRef->getType(), VK_LValue, IRef->getExprLoc()); return EmitLValue(&DRE).getAddress(); }); // Emit reduction copy. bool IsRegistered = PrivateScope.addPrivate(OrigVD, [this, PrivateVD]() -> Address { // Emit private VarDecl with reduction init. EmitDecl(*PrivateVD); return GetAddrOfLocalVar(PrivateVD); }); assert(IsRegistered && "private var already registered as private"); // Silence the warning about unused variable. (void)IsRegistered; PrivateScope.addPrivate(RHSVD, [this, PrivateVD]() -> Address { return GetAddrOfLocalVar(PrivateVD); }); } ++ILHS, ++IRHS, ++IPriv; } } } void CodeGenFunction::EmitOMPReductionClauseFinal( const OMPExecutableDirective &D) { if (!HaveInsertPoint()) return; llvm::SmallVector Privates; llvm::SmallVector LHSExprs; llvm::SmallVector RHSExprs; llvm::SmallVector ReductionOps; bool HasAtLeastOneReduction = false; for (const auto *C : D.getClausesOfKind()) { HasAtLeastOneReduction = true; Privates.append(C->privates().begin(), C->privates().end()); LHSExprs.append(C->lhs_exprs().begin(), C->lhs_exprs().end()); RHSExprs.append(C->rhs_exprs().begin(), C->rhs_exprs().end()); ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end()); } if (HasAtLeastOneReduction) { // Emit nowait reduction if nowait clause is present or directive is a // parallel directive (it always has implicit barrier). CGM.getOpenMPRuntime().emitReduction( *this, D.getLocEnd(), Privates, LHSExprs, RHSExprs, ReductionOps, D.getSingleClause() || isOpenMPParallelDirective(D.getDirectiveKind()) || D.getDirectiveKind() == OMPD_simd, D.getDirectiveKind() == OMPD_simd); } } static void emitCommonOMPParallelDirective(CodeGenFunction &CGF, const OMPExecutableDirective &S, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { auto CS = cast(S.getAssociatedStmt()); llvm::SmallVector CapturedVars; CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars); auto OutlinedFn = CGF.CGM.getOpenMPRuntime().emitParallelOutlinedFunction( S, *CS->getCapturedDecl()->param_begin(), InnermostKind, CodeGen); if (const auto *NumThreadsClause = S.getSingleClause()) { CodeGenFunction::RunCleanupsScope NumThreadsScope(CGF); auto NumThreads = CGF.EmitScalarExpr(NumThreadsClause->getNumThreads(), /*IgnoreResultAssign*/ true); CGF.CGM.getOpenMPRuntime().emitNumThreadsClause( CGF, NumThreads, NumThreadsClause->getLocStart()); } if (const auto *ProcBindClause = S.getSingleClause()) { CodeGenFunction::RunCleanupsScope NumThreadsScope(CGF); CGF.CGM.getOpenMPRuntime().emitProcBindClause( CGF, ProcBindClause->getProcBindKind(), ProcBindClause->getLocStart()); } const Expr *IfCond = nullptr; for (const auto *C : S.getClausesOfKind()) { if (C->getNameModifier() == OMPD_unknown || C->getNameModifier() == OMPD_parallel) { IfCond = C->getCondition(); break; } } CGF.CGM.getOpenMPRuntime().emitParallelCall(CGF, S.getLocStart(), OutlinedFn, CapturedVars, IfCond); } void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) { LexicalScope Scope(*this, S.getSourceRange()); // Emit parallel region as a standalone region. auto &&CodeGen = [&S](CodeGenFunction &CGF) { OMPPrivateScope PrivateScope(CGF); bool Copyins = CGF.EmitOMPCopyinClause(S); bool Firstprivates = CGF.EmitOMPFirstprivateClause(S, PrivateScope); if (Copyins || Firstprivates) { // Emit implicit barrier to synchronize threads and avoid data races on // initialization of firstprivate variables or propagation master's thread // values of threadprivate variables to local instances of that variables // of all other implicit threads. CGF.CGM.getOpenMPRuntime().emitBarrierCall( CGF, S.getLocStart(), OMPD_unknown, /*EmitChecks=*/false, /*ForceSimpleCall=*/true); } CGF.EmitOMPPrivateClause(S, PrivateScope); CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); CGF.EmitStmt(cast(S.getAssociatedStmt())->getCapturedStmt()); CGF.EmitOMPReductionClauseFinal(S); }; emitCommonOMPParallelDirective(*this, S, OMPD_parallel, CodeGen); } void CodeGenFunction::EmitOMPLoopBody(const OMPLoopDirective &D, JumpDest LoopExit) { RunCleanupsScope BodyScope(*this); // Update counters values on current iteration. for (auto I : D.updates()) { EmitIgnoredExpr(I); } // Update the linear variables. for (const auto *C : D.getClausesOfKind()) { for (auto U : C->updates()) { EmitIgnoredExpr(U); } } // On a continue in the body, jump to the end. auto Continue = getJumpDestInCurrentScope("omp.body.continue"); BreakContinueStack.push_back(BreakContinue(LoopExit, Continue)); // Emit loop body. EmitStmt(D.getBody()); // The end (updates/cleanups). EmitBlock(Continue.getBlock()); BreakContinueStack.pop_back(); // TODO: Update lastprivates if the SeparateIter flag is true. // This will be implemented in a follow-up OMPLastprivateClause patch, but // result should be still correct without it, as we do not make these // variables private yet. } void CodeGenFunction::EmitOMPInnerLoop( const Stmt &S, bool RequiresCleanup, const Expr *LoopCond, const Expr *IncExpr, const llvm::function_ref &BodyGen, const llvm::function_ref &PostIncGen) { auto LoopExit = getJumpDestInCurrentScope("omp.inner.for.end"); // Start the loop with a block that tests the condition. auto CondBlock = createBasicBlock("omp.inner.for.cond"); EmitBlock(CondBlock); LoopStack.push(CondBlock); // If there are any cleanups between here and the loop-exit scope, // create a block to stage a loop exit along. auto ExitBlock = LoopExit.getBlock(); if (RequiresCleanup) ExitBlock = createBasicBlock("omp.inner.for.cond.cleanup"); auto LoopBody = createBasicBlock("omp.inner.for.body"); // Emit condition. EmitBranchOnBoolExpr(LoopCond, LoopBody, ExitBlock, getProfileCount(&S)); if (ExitBlock != LoopExit.getBlock()) { EmitBlock(ExitBlock); EmitBranchThroughCleanup(LoopExit); } EmitBlock(LoopBody); incrementProfileCounter(&S); // Create a block for the increment. auto Continue = getJumpDestInCurrentScope("omp.inner.for.inc"); BreakContinueStack.push_back(BreakContinue(LoopExit, Continue)); BodyGen(*this); // Emit "IV = IV + 1" and a back-edge to the condition block. EmitBlock(Continue.getBlock()); EmitIgnoredExpr(IncExpr); PostIncGen(*this); BreakContinueStack.pop_back(); EmitBranch(CondBlock); LoopStack.pop(); // Emit the fall-through block. EmitBlock(LoopExit.getBlock()); } void CodeGenFunction::EmitOMPLinearClauseInit(const OMPLoopDirective &D) { if (!HaveInsertPoint()) return; // Emit inits for the linear variables. for (const auto *C : D.getClausesOfKind()) { for (auto Init : C->inits()) { auto *VD = cast(cast(Init)->getDecl()); auto *OrigVD = cast( cast(VD->getInit()->IgnoreImpCasts())->getDecl()); DeclRefExpr DRE(const_cast(OrigVD), CapturedStmtInfo->lookup(OrigVD) != nullptr, VD->getInit()->getType(), VK_LValue, VD->getInit()->getExprLoc()); AutoVarEmission Emission = EmitAutoVarAlloca(*VD); EmitExprAsInit(&DRE, VD, MakeAddrLValue(Emission.getAllocatedAddress(), VD->getType()), /*capturedByInit=*/false); EmitAutoVarCleanups(Emission); } // Emit the linear steps for the linear clauses. // If a step is not constant, it is pre-calculated before the loop. if (auto CS = cast_or_null(C->getCalcStep())) if (auto SaveRef = cast(CS->getLHS())) { EmitVarDecl(*cast(SaveRef->getDecl())); // Emit calculation of the linear step. EmitIgnoredExpr(CS); } } } static void emitLinearClauseFinal(CodeGenFunction &CGF, const OMPLoopDirective &D) { if (!CGF.HaveInsertPoint()) return; // Emit the final values of the linear variables. for (const auto *C : D.getClausesOfKind()) { auto IC = C->varlist_begin(); for (auto F : C->finals()) { auto *OrigVD = cast(cast(*IC)->getDecl()); DeclRefExpr DRE(const_cast(OrigVD), CGF.CapturedStmtInfo->lookup(OrigVD) != nullptr, (*IC)->getType(), VK_LValue, (*IC)->getExprLoc()); Address OrigAddr = CGF.EmitLValue(&DRE).getAddress(); CodeGenFunction::OMPPrivateScope VarScope(CGF); VarScope.addPrivate(OrigVD, [OrigAddr]() -> Address { return OrigAddr; }); (void)VarScope.Privatize(); CGF.EmitIgnoredExpr(F); ++IC; } } } static void emitAlignedClause(CodeGenFunction &CGF, const OMPExecutableDirective &D) { if (!CGF.HaveInsertPoint()) return; for (const auto *Clause : D.getClausesOfKind()) { unsigned ClauseAlignment = 0; if (auto AlignmentExpr = Clause->getAlignment()) { auto AlignmentCI = cast(CGF.EmitScalarExpr(AlignmentExpr)); ClauseAlignment = static_cast(AlignmentCI->getZExtValue()); } for (auto E : Clause->varlists()) { unsigned Alignment = ClauseAlignment; if (Alignment == 0) { // OpenMP [2.8.1, Description] // If no optional parameter is specified, implementation-defined default // alignments for SIMD instructions on the target platforms are assumed. Alignment = CGF.getContext() .toCharUnitsFromBits(CGF.getContext().getOpenMPDefaultSimdAlign( E->getType()->getPointeeType())) .getQuantity(); } assert((Alignment == 0 || llvm::isPowerOf2_32(Alignment)) && "alignment is not power of 2"); if (Alignment != 0) { llvm::Value *PtrValue = CGF.EmitScalarExpr(E); CGF.EmitAlignmentAssumption(PtrValue, Alignment); } } } } static void emitPrivateLoopCounters(CodeGenFunction &CGF, CodeGenFunction::OMPPrivateScope &LoopScope, ArrayRef Counters, ArrayRef PrivateCounters) { if (!CGF.HaveInsertPoint()) return; auto I = PrivateCounters.begin(); for (auto *E : Counters) { auto *VD = cast(cast(E)->getDecl()); auto *PrivateVD = cast(cast(*I)->getDecl()); Address Addr = Address::invalid(); (void)LoopScope.addPrivate(PrivateVD, [&]() -> Address { // Emit var without initialization. auto VarEmission = CGF.EmitAutoVarAlloca(*PrivateVD); CGF.EmitAutoVarCleanups(VarEmission); Addr = VarEmission.getAllocatedAddress(); return Addr; }); (void)LoopScope.addPrivate(VD, [&]() -> Address { return Addr; }); ++I; } } static void emitPreCond(CodeGenFunction &CGF, const OMPLoopDirective &S, const Expr *Cond, llvm::BasicBlock *TrueBlock, llvm::BasicBlock *FalseBlock, uint64_t TrueCount) { if (!CGF.HaveInsertPoint()) return; { CodeGenFunction::OMPPrivateScope PreCondScope(CGF); emitPrivateLoopCounters(CGF, PreCondScope, S.counters(), S.private_counters()); (void)PreCondScope.Privatize(); // Get initial values of real counters. for (auto I : S.inits()) { CGF.EmitIgnoredExpr(I); } } // Check that loop is executed at least one time. CGF.EmitBranchOnBoolExpr(Cond, TrueBlock, FalseBlock, TrueCount); } static void emitPrivateLinearVars(CodeGenFunction &CGF, const OMPExecutableDirective &D, CodeGenFunction::OMPPrivateScope &PrivateScope) { if (!CGF.HaveInsertPoint()) return; for (const auto *C : D.getClausesOfKind()) { auto CurPrivate = C->privates().begin(); for (auto *E : C->varlists()) { auto *VD = cast(cast(E)->getDecl()); auto *PrivateVD = cast(cast(*CurPrivate)->getDecl()); bool IsRegistered = PrivateScope.addPrivate(VD, [&]() -> Address { // Emit private VarDecl with copy init. CGF.EmitVarDecl(*PrivateVD); return CGF.GetAddrOfLocalVar(PrivateVD); }); assert(IsRegistered && "linear var already registered as private"); // Silence the warning about unused variable. (void)IsRegistered; ++CurPrivate; } } } static void emitSimdlenSafelenClause(CodeGenFunction &CGF, const OMPExecutableDirective &D, bool IsMonotonic) { if (!CGF.HaveInsertPoint()) return; if (const auto *C = D.getSingleClause()) { RValue Len = CGF.EmitAnyExpr(C->getSimdlen(), AggValueSlot::ignored(), /*ignoreResult=*/true); llvm::ConstantInt *Val = cast(Len.getScalarVal()); CGF.LoopStack.setVectorizeWidth(Val->getZExtValue()); // In presence of finite 'safelen', it may be unsafe to mark all // the memory instructions parallel, because loop-carried // dependences of 'safelen' iterations are possible. if (!IsMonotonic) CGF.LoopStack.setParallel(!D.getSingleClause()); } else if (const auto *C = D.getSingleClause()) { RValue Len = CGF.EmitAnyExpr(C->getSafelen(), AggValueSlot::ignored(), /*ignoreResult=*/true); llvm::ConstantInt *Val = cast(Len.getScalarVal()); CGF.LoopStack.setVectorizeWidth(Val->getZExtValue()); // In presence of finite 'safelen', it may be unsafe to mark all // the memory instructions parallel, because loop-carried // dependences of 'safelen' iterations are possible. CGF.LoopStack.setParallel(false); } } void CodeGenFunction::EmitOMPSimdInit(const OMPLoopDirective &D, bool IsMonotonic) { // Walk clauses and process safelen/lastprivate. LoopStack.setParallel(!IsMonotonic); LoopStack.setVectorizeEnable(true); emitSimdlenSafelenClause(*this, D, IsMonotonic); } void CodeGenFunction::EmitOMPSimdFinal(const OMPLoopDirective &D) { if (!HaveInsertPoint()) return; auto IC = D.counters().begin(); for (auto F : D.finals()) { auto *OrigVD = cast(cast((*IC))->getDecl()); if (LocalDeclMap.count(OrigVD) || CapturedStmtInfo->lookup(OrigVD)) { DeclRefExpr DRE(const_cast(OrigVD), CapturedStmtInfo->lookup(OrigVD) != nullptr, (*IC)->getType(), VK_LValue, (*IC)->getExprLoc()); Address OrigAddr = EmitLValue(&DRE).getAddress(); OMPPrivateScope VarScope(*this); VarScope.addPrivate(OrigVD, [OrigAddr]() -> Address { return OrigAddr; }); (void)VarScope.Privatize(); EmitIgnoredExpr(F); } ++IC; } emitLinearClauseFinal(*this, D); } void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) { auto &&CodeGen = [&S](CodeGenFunction &CGF) { // if (PreCond) { // for (IV in 0..LastIteration) BODY; // ; // } // // Emit: if (PreCond) - begin. // If the condition constant folds and can be elided, avoid emitting the // whole loop. bool CondConstant; llvm::BasicBlock *ContBlock = nullptr; if (CGF.ConstantFoldsToSimpleInteger(S.getPreCond(), CondConstant)) { if (!CondConstant) return; } else { auto *ThenBlock = CGF.createBasicBlock("simd.if.then"); ContBlock = CGF.createBasicBlock("simd.if.end"); emitPreCond(CGF, S, S.getPreCond(), ThenBlock, ContBlock, CGF.getProfileCount(&S)); CGF.EmitBlock(ThenBlock); CGF.incrementProfileCounter(&S); } // Emit the loop iteration variable. const Expr *IVExpr = S.getIterationVariable(); const VarDecl *IVDecl = cast(cast(IVExpr)->getDecl()); CGF.EmitVarDecl(*IVDecl); CGF.EmitIgnoredExpr(S.getInit()); // Emit the iterations count variable. // If it is not a variable, Sema decided to calculate iterations count on // each iteration (e.g., it is foldable into a constant). if (auto LIExpr = dyn_cast(S.getLastIteration())) { CGF.EmitVarDecl(*cast(LIExpr->getDecl())); // Emit calculation of the iterations count. CGF.EmitIgnoredExpr(S.getCalcLastIteration()); } CGF.EmitOMPSimdInit(S); emitAlignedClause(CGF, S); CGF.EmitOMPLinearClauseInit(S); bool HasLastprivateClause; { OMPPrivateScope LoopScope(CGF); emitPrivateLoopCounters(CGF, LoopScope, S.counters(), S.private_counters()); emitPrivateLinearVars(CGF, S, LoopScope); CGF.EmitOMPPrivateClause(S, LoopScope); CGF.EmitOMPReductionClauseInit(S, LoopScope); HasLastprivateClause = CGF.EmitOMPLastprivateClauseInit(S, LoopScope); (void)LoopScope.Privatize(); CGF.EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), S.getCond(), S.getInc(), [&S](CodeGenFunction &CGF) { CGF.EmitOMPLoopBody(S, JumpDest()); CGF.EmitStopPoint(&S); }, [](CodeGenFunction &) {}); // Emit final copy of the lastprivate variables at the end of loops. if (HasLastprivateClause) { CGF.EmitOMPLastprivateClauseFinal(S); } CGF.EmitOMPReductionClauseFinal(S); } CGF.EmitOMPSimdFinal(S); // Emit: if (PreCond) - end. if (ContBlock) { CGF.EmitBranch(ContBlock); CGF.EmitBlock(ContBlock, true); } }; CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, CodeGen); } void CodeGenFunction::EmitOMPForOuterLoop( OpenMPScheduleClauseKind ScheduleKind, bool IsMonotonic, const OMPLoopDirective &S, OMPPrivateScope &LoopScope, bool Ordered, Address LB, Address UB, Address ST, Address IL, llvm::Value *Chunk) { auto &RT = CGM.getOpenMPRuntime(); // Dynamic scheduling of the outer loop (dynamic, guided, auto, runtime). const bool DynamicOrOrdered = Ordered || RT.isDynamic(ScheduleKind); assert((Ordered || !RT.isStaticNonchunked(ScheduleKind, /*Chunked=*/Chunk != nullptr)) && "static non-chunked schedule does not need outer loop"); // Emit outer loop. // // OpenMP [2.7.1, Loop Construct, Description, table 2-1] // When schedule(dynamic,chunk_size) is specified, the iterations are // distributed to threads in the team in chunks as the threads request them. // Each thread executes a chunk of iterations, then requests another chunk, // until no chunks remain to be distributed. Each chunk contains chunk_size // iterations, except for the last chunk to be distributed, which may have // fewer iterations. When no chunk_size is specified, it defaults to 1. // // When schedule(guided,chunk_size) is specified, the iterations are assigned // to threads in the team in chunks as the executing threads request them. // Each thread executes a chunk of iterations, then requests another chunk, // until no chunks remain to be assigned. For a chunk_size of 1, the size of // each chunk is proportional to the number of unassigned iterations divided // by the number of threads in the team, decreasing to 1. For a chunk_size // with value k (greater than 1), the size of each chunk is determined in the // same way, with the restriction that the chunks do not contain fewer than k // iterations (except for the last chunk to be assigned, which may have fewer // than k iterations). // // When schedule(auto) is specified, the decision regarding scheduling is // delegated to the compiler and/or runtime system. The programmer gives the // implementation the freedom to choose any possible mapping of iterations to // threads in the team. // // When schedule(runtime) is specified, the decision regarding scheduling is // deferred until run time, and the schedule and chunk size are taken from the // run-sched-var ICV. If the ICV is set to auto, the schedule is // implementation defined // // while(__kmpc_dispatch_next(&LB, &UB)) { // idx = LB; // while (idx <= UB) { BODY; ++idx; // __kmpc_dispatch_fini_(4|8)[u](); // For ordered loops only. // } // inner loop // } // // OpenMP [2.7.1, Loop Construct, Description, table 2-1] // When schedule(static, chunk_size) is specified, iterations are divided into // chunks of size chunk_size, and the chunks are assigned to the threads in // the team in a round-robin fashion in the order of the thread number. // // while(UB = min(UB, GlobalUB), idx = LB, idx < UB) { // while (idx <= UB) { BODY; ++idx; } // inner loop // LB = LB + ST; // UB = UB + ST; // } // const Expr *IVExpr = S.getIterationVariable(); const unsigned IVSize = getContext().getTypeSize(IVExpr->getType()); const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation(); if (DynamicOrOrdered) { llvm::Value *UBVal = EmitScalarExpr(S.getLastIteration()); RT.emitForDispatchInit(*this, S.getLocStart(), ScheduleKind, IVSize, IVSigned, Ordered, UBVal, Chunk); } else { RT.emitForStaticInit(*this, S.getLocStart(), ScheduleKind, IVSize, IVSigned, Ordered, IL, LB, UB, ST, Chunk); } auto LoopExit = getJumpDestInCurrentScope("omp.dispatch.end"); // Start the loop with a block that tests the condition. auto CondBlock = createBasicBlock("omp.dispatch.cond"); EmitBlock(CondBlock); LoopStack.push(CondBlock); llvm::Value *BoolCondVal = nullptr; if (!DynamicOrOrdered) { // UB = min(UB, GlobalUB) EmitIgnoredExpr(S.getEnsureUpperBound()); // IV = LB EmitIgnoredExpr(S.getInit()); // IV < UB BoolCondVal = EvaluateExprAsBool(S.getCond()); } else { BoolCondVal = RT.emitForNext(*this, S.getLocStart(), IVSize, IVSigned, IL, LB, UB, ST); } // If there are any cleanups between here and the loop-exit scope, // create a block to stage a loop exit along. auto ExitBlock = LoopExit.getBlock(); if (LoopScope.requiresCleanups()) ExitBlock = createBasicBlock("omp.dispatch.cleanup"); auto LoopBody = createBasicBlock("omp.dispatch.body"); Builder.CreateCondBr(BoolCondVal, LoopBody, ExitBlock); if (ExitBlock != LoopExit.getBlock()) { EmitBlock(ExitBlock); EmitBranchThroughCleanup(LoopExit); } EmitBlock(LoopBody); // Emit "IV = LB" (in case of static schedule, we have already calculated new // LB for loop condition and emitted it above). if (DynamicOrOrdered) EmitIgnoredExpr(S.getInit()); // Create a block for the increment. auto Continue = getJumpDestInCurrentScope("omp.dispatch.inc"); BreakContinueStack.push_back(BreakContinue(LoopExit, Continue)); // Generate !llvm.loop.parallel metadata for loads and stores for loops // with dynamic/guided scheduling and without ordered clause. if (!isOpenMPSimdDirective(S.getDirectiveKind())) LoopStack.setParallel(!IsMonotonic); else EmitOMPSimdInit(S, IsMonotonic); SourceLocation Loc = S.getLocStart(); EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), S.getCond(), S.getInc(), [&S, LoopExit](CodeGenFunction &CGF) { CGF.EmitOMPLoopBody(S, LoopExit); CGF.EmitStopPoint(&S); }, [Ordered, IVSize, IVSigned, Loc](CodeGenFunction &CGF) { if (Ordered) { CGF.CGM.getOpenMPRuntime().emitForOrderedIterationEnd( CGF, Loc, IVSize, IVSigned); } }); EmitBlock(Continue.getBlock()); BreakContinueStack.pop_back(); if (!DynamicOrOrdered) { // Emit "LB = LB + Stride", "UB = UB + Stride". EmitIgnoredExpr(S.getNextLowerBound()); EmitIgnoredExpr(S.getNextUpperBound()); } EmitBranch(CondBlock); LoopStack.pop(); // Emit the fall-through block. EmitBlock(LoopExit.getBlock()); // Tell the runtime we are done. if (!DynamicOrOrdered) RT.emitForStaticFinish(*this, S.getLocEnd()); } /// \brief Emit a helper variable and return corresponding lvalue. static LValue EmitOMPHelperVar(CodeGenFunction &CGF, const DeclRefExpr *Helper) { auto VDecl = cast(Helper->getDecl()); CGF.EmitVarDecl(*VDecl); return CGF.EmitLValue(Helper); } namespace { struct ScheduleKindModifiersTy { OpenMPScheduleClauseKind Kind; OpenMPScheduleClauseModifier M1; OpenMPScheduleClauseModifier M2; ScheduleKindModifiersTy(OpenMPScheduleClauseKind Kind, OpenMPScheduleClauseModifier M1, OpenMPScheduleClauseModifier M2) : Kind(Kind), M1(M1), M2(M2) {} }; } // namespace static std::pair emitScheduleClause(CodeGenFunction &CGF, const OMPLoopDirective &S, bool OuterRegion) { // Detect the loop schedule kind and chunk. auto ScheduleKind = OMPC_SCHEDULE_unknown; OpenMPScheduleClauseModifier M1 = OMPC_SCHEDULE_MODIFIER_unknown; OpenMPScheduleClauseModifier M2 = OMPC_SCHEDULE_MODIFIER_unknown; llvm::Value *Chunk = nullptr; if (const auto *C = S.getSingleClause()) { ScheduleKind = C->getScheduleKind(); M1 = C->getFirstScheduleModifier(); M2 = C->getSecondScheduleModifier(); if (const auto *Ch = C->getChunkSize()) { if (auto *ImpRef = cast_or_null(C->getHelperChunkSize())) { if (OuterRegion) { const VarDecl *ImpVar = cast(ImpRef->getDecl()); CGF.EmitVarDecl(*ImpVar); CGF.EmitStoreThroughLValue( CGF.EmitAnyExpr(Ch), CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(ImpVar), ImpVar->getType())); } else { Ch = ImpRef; } } if (!C->getHelperChunkSize() || !OuterRegion) { Chunk = CGF.EmitScalarExpr(Ch); Chunk = CGF.EmitScalarConversion(Chunk, Ch->getType(), S.getIterationVariable()->getType(), S.getLocStart()); } } } return std::make_pair(Chunk, ScheduleKindModifiersTy(ScheduleKind, M1, M2)); } bool CodeGenFunction::EmitOMPWorksharingLoop(const OMPLoopDirective &S) { // Emit the loop iteration variable. auto IVExpr = cast(S.getIterationVariable()); auto IVDecl = cast(IVExpr->getDecl()); EmitVarDecl(*IVDecl); // Emit the iterations count variable. // If it is not a variable, Sema decided to calculate iterations count on each // iteration (e.g., it is foldable into a constant). if (auto LIExpr = dyn_cast(S.getLastIteration())) { EmitVarDecl(*cast(LIExpr->getDecl())); // Emit calculation of the iterations count. EmitIgnoredExpr(S.getCalcLastIteration()); } auto &RT = CGM.getOpenMPRuntime(); bool HasLastprivateClause; // Check pre-condition. { // Skip the entire loop if we don't meet the precondition. // If the condition constant folds and can be elided, avoid emitting the // whole loop. bool CondConstant; llvm::BasicBlock *ContBlock = nullptr; if (ConstantFoldsToSimpleInteger(S.getPreCond(), CondConstant)) { if (!CondConstant) return false; } else { auto *ThenBlock = createBasicBlock("omp.precond.then"); ContBlock = createBasicBlock("omp.precond.end"); emitPreCond(*this, S, S.getPreCond(), ThenBlock, ContBlock, getProfileCount(&S)); EmitBlock(ThenBlock); incrementProfileCounter(&S); } emitAlignedClause(*this, S); EmitOMPLinearClauseInit(S); // Emit 'then' code. { // Emit helper vars inits. LValue LB = EmitOMPHelperVar(*this, cast(S.getLowerBoundVariable())); LValue UB = EmitOMPHelperVar(*this, cast(S.getUpperBoundVariable())); LValue ST = EmitOMPHelperVar(*this, cast(S.getStrideVariable())); LValue IL = EmitOMPHelperVar(*this, cast(S.getIsLastIterVariable())); OMPPrivateScope LoopScope(*this); if (EmitOMPFirstprivateClause(S, LoopScope)) { // Emit implicit barrier to synchronize threads and avoid data races on // initialization of firstprivate variables. CGM.getOpenMPRuntime().emitBarrierCall( *this, S.getLocStart(), OMPD_unknown, /*EmitChecks=*/false, /*ForceSimpleCall=*/true); } EmitOMPPrivateClause(S, LoopScope); HasLastprivateClause = EmitOMPLastprivateClauseInit(S, LoopScope); EmitOMPReductionClauseInit(S, LoopScope); emitPrivateLoopCounters(*this, LoopScope, S.counters(), S.private_counters()); emitPrivateLinearVars(*this, S, LoopScope); (void)LoopScope.Privatize(); // Detect the loop schedule kind and chunk. llvm::Value *Chunk; OpenMPScheduleClauseKind ScheduleKind; auto ScheduleInfo = emitScheduleClause(*this, S, /*OuterRegion=*/false); Chunk = ScheduleInfo.first; ScheduleKind = ScheduleInfo.second.Kind; const OpenMPScheduleClauseModifier M1 = ScheduleInfo.second.M1; const OpenMPScheduleClauseModifier M2 = ScheduleInfo.second.M2; const unsigned IVSize = getContext().getTypeSize(IVExpr->getType()); const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation(); const bool Ordered = S.getSingleClause() != nullptr; // OpenMP 4.5, 2.7.1 Loop Construct, Description. // If the static schedule kind is specified or if the ordered clause is // specified, and if no monotonic modifier is specified, the effect will // be as if the monotonic modifier was specified. if (RT.isStaticNonchunked(ScheduleKind, /* Chunked */ Chunk != nullptr) && !Ordered) { if (isOpenMPSimdDirective(S.getDirectiveKind())) EmitOMPSimdInit(S, /*IsMonotonic=*/true); // OpenMP [2.7.1, Loop Construct, Description, table 2-1] // When no chunk_size is specified, the iteration space is divided into // chunks that are approximately equal in size, and at most one chunk is // distributed to each thread. Note that the size of the chunks is // unspecified in this case. RT.emitForStaticInit(*this, S.getLocStart(), ScheduleKind, IVSize, IVSigned, Ordered, IL.getAddress(), LB.getAddress(), UB.getAddress(), ST.getAddress()); auto LoopExit = getJumpDestInCurrentScope(createBasicBlock("omp.loop.exit")); // UB = min(UB, GlobalUB); EmitIgnoredExpr(S.getEnsureUpperBound()); // IV = LB; EmitIgnoredExpr(S.getInit()); // while (idx <= UB) { BODY; ++idx; } EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), S.getCond(), S.getInc(), [&S, LoopExit](CodeGenFunction &CGF) { CGF.EmitOMPLoopBody(S, LoopExit); CGF.EmitStopPoint(&S); }, [](CodeGenFunction &) {}); EmitBlock(LoopExit.getBlock()); // Tell the runtime we are done. RT.emitForStaticFinish(*this, S.getLocStart()); } else { const bool IsMonotonic = Ordered || ScheduleKind == OMPC_SCHEDULE_static || ScheduleKind == OMPC_SCHEDULE_unknown || M1 == OMPC_SCHEDULE_MODIFIER_monotonic || M2 == OMPC_SCHEDULE_MODIFIER_monotonic; // Emit the outer loop, which requests its work chunk [LB..UB] from // runtime and runs the inner loop to process it. EmitOMPForOuterLoop(ScheduleKind, IsMonotonic, S, LoopScope, Ordered, LB.getAddress(), UB.getAddress(), ST.getAddress(), IL.getAddress(), Chunk); } EmitOMPReductionClauseFinal(S); // Emit final copy of the lastprivate variables if IsLastIter != 0. if (HasLastprivateClause) EmitOMPLastprivateClauseFinal( S, Builder.CreateIsNotNull(EmitLoadOfScalar(IL, S.getLocStart()))); } if (isOpenMPSimdDirective(S.getDirectiveKind())) { EmitOMPSimdFinal(S); } // We're now done with the loop, so jump to the continuation block. if (ContBlock) { EmitBranch(ContBlock); EmitBlock(ContBlock, true); } } return HasLastprivateClause; } void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) { LexicalScope Scope(*this, S.getSourceRange()); bool HasLastprivates = false; auto &&CodeGen = [&S, &HasLastprivates](CodeGenFunction &CGF) { HasLastprivates = CGF.EmitOMPWorksharingLoop(S); }; CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_for, CodeGen, S.hasCancel()); // Emit an implicit barrier at the end. if (!S.getSingleClause() || HasLastprivates) { CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getLocStart(), OMPD_for); } } void CodeGenFunction::EmitOMPForSimdDirective(const OMPForSimdDirective &S) { LexicalScope Scope(*this, S.getSourceRange()); bool HasLastprivates = false; auto &&CodeGen = [&S, &HasLastprivates](CodeGenFunction &CGF) { HasLastprivates = CGF.EmitOMPWorksharingLoop(S); }; CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, CodeGen); // Emit an implicit barrier at the end. if (!S.getSingleClause() || HasLastprivates) { CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getLocStart(), OMPD_for); } } static LValue createSectionLVal(CodeGenFunction &CGF, QualType Ty, const Twine &Name, llvm::Value *Init = nullptr) { auto LVal = CGF.MakeAddrLValue(CGF.CreateMemTemp(Ty, Name), Ty); if (Init) CGF.EmitScalarInit(Init, LVal); return LVal; } OpenMPDirectiveKind CodeGenFunction::EmitSections(const OMPExecutableDirective &S) { auto *Stmt = cast(S.getAssociatedStmt())->getCapturedStmt(); auto *CS = dyn_cast(Stmt); if (CS && CS->size() > 1) { bool HasLastprivates = false; auto &&CodeGen = [&S, CS, &HasLastprivates](CodeGenFunction &CGF) { auto &C = CGF.CGM.getContext(); auto KmpInt32Ty = C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1); // Emit helper vars inits. LValue LB = createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.lb.", CGF.Builder.getInt32(0)); auto *GlobalUBVal = CGF.Builder.getInt32(CS->size() - 1); LValue UB = createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.ub.", GlobalUBVal); LValue ST = createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.st.", CGF.Builder.getInt32(1)); LValue IL = createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.il.", CGF.Builder.getInt32(0)); // Loop counter. LValue IV = createSectionLVal(CGF, KmpInt32Ty, ".omp.sections.iv."); OpaqueValueExpr IVRefExpr(S.getLocStart(), KmpInt32Ty, VK_LValue); CodeGenFunction::OpaqueValueMapping OpaqueIV(CGF, &IVRefExpr, IV); OpaqueValueExpr UBRefExpr(S.getLocStart(), KmpInt32Ty, VK_LValue); CodeGenFunction::OpaqueValueMapping OpaqueUB(CGF, &UBRefExpr, UB); // Generate condition for loop. BinaryOperator Cond(&IVRefExpr, &UBRefExpr, BO_LE, C.BoolTy, VK_RValue, OK_Ordinary, S.getLocStart(), /*fpContractable=*/false); // Increment for loop counter. UnaryOperator Inc(&IVRefExpr, UO_PreInc, KmpInt32Ty, VK_RValue, OK_Ordinary, S.getLocStart()); auto BodyGen = [CS, &S, &IV](CodeGenFunction &CGF) { // Iterate through all sections and emit a switch construct: // switch (IV) { // case 0: // ; // break; // ... // case - 1: // - 1]>; // break; // } // .omp.sections.exit: auto *ExitBB = CGF.createBasicBlock(".omp.sections.exit"); auto *SwitchStmt = CGF.Builder.CreateSwitch( CGF.EmitLoadOfLValue(IV, S.getLocStart()).getScalarVal(), ExitBB, CS->size()); unsigned CaseNumber = 0; for (auto *SubStmt : CS->children()) { auto CaseBB = CGF.createBasicBlock(".omp.sections.case"); CGF.EmitBlock(CaseBB); SwitchStmt->addCase(CGF.Builder.getInt32(CaseNumber), CaseBB); CGF.EmitStmt(SubStmt); CGF.EmitBranch(ExitBB); ++CaseNumber; } CGF.EmitBlock(ExitBB, /*IsFinished=*/true); }; CodeGenFunction::OMPPrivateScope LoopScope(CGF); if (CGF.EmitOMPFirstprivateClause(S, LoopScope)) { // Emit implicit barrier to synchronize threads and avoid data races on // initialization of firstprivate variables. CGF.CGM.getOpenMPRuntime().emitBarrierCall( CGF, S.getLocStart(), OMPD_unknown, /*EmitChecks=*/false, /*ForceSimpleCall=*/true); } CGF.EmitOMPPrivateClause(S, LoopScope); HasLastprivates = CGF.EmitOMPLastprivateClauseInit(S, LoopScope); CGF.EmitOMPReductionClauseInit(S, LoopScope); (void)LoopScope.Privatize(); // Emit static non-chunked loop. CGF.CGM.getOpenMPRuntime().emitForStaticInit( CGF, S.getLocStart(), OMPC_SCHEDULE_static, /*IVSize=*/32, /*IVSigned=*/true, /*Ordered=*/false, IL.getAddress(), LB.getAddress(), UB.getAddress(), ST.getAddress()); // UB = min(UB, GlobalUB); auto *UBVal = CGF.EmitLoadOfScalar(UB, S.getLocStart()); auto *MinUBGlobalUB = CGF.Builder.CreateSelect( CGF.Builder.CreateICmpSLT(UBVal, GlobalUBVal), UBVal, GlobalUBVal); CGF.EmitStoreOfScalar(MinUBGlobalUB, UB); // IV = LB; CGF.EmitStoreOfScalar(CGF.EmitLoadOfScalar(LB, S.getLocStart()), IV); // while (idx <= UB) { BODY; ++idx; } CGF.EmitOMPInnerLoop(S, /*RequiresCleanup=*/false, &Cond, &Inc, BodyGen, [](CodeGenFunction &) {}); // Tell the runtime we are done. CGF.CGM.getOpenMPRuntime().emitForStaticFinish(CGF, S.getLocStart()); CGF.EmitOMPReductionClauseFinal(S); // Emit final copy of the lastprivate variables if IsLastIter != 0. if (HasLastprivates) CGF.EmitOMPLastprivateClauseFinal( S, CGF.Builder.CreateIsNotNull( CGF.EmitLoadOfScalar(IL, S.getLocStart()))); }; bool HasCancel = false; if (auto *OSD = dyn_cast(&S)) HasCancel = OSD->hasCancel(); else if (auto *OPSD = dyn_cast(&S)) HasCancel = OPSD->hasCancel(); CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_sections, CodeGen, HasCancel); // Emit barrier for lastprivates only if 'sections' directive has 'nowait' // clause. Otherwise the barrier will be generated by the codegen for the // directive. if (HasLastprivates && S.getSingleClause()) { // Emit implicit barrier to synchronize threads and avoid data races on // initialization of firstprivate variables. CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getLocStart(), OMPD_unknown); } return OMPD_sections; } // If only one section is found - no need to generate loop, emit as a single // region. bool HasFirstprivates; // No need to generate reductions for sections with single section region, we // can use original shared variables for all operations. bool HasReductions = S.hasClausesOfKind(); // No need to generate lastprivates for sections with single section region, // we can use original shared variable for all calculations with barrier at // the end of the sections. bool HasLastprivates = S.hasClausesOfKind(); auto &&CodeGen = [Stmt, &S, &HasFirstprivates](CodeGenFunction &CGF) { CodeGenFunction::OMPPrivateScope SingleScope(CGF); HasFirstprivates = CGF.EmitOMPFirstprivateClause(S, SingleScope); CGF.EmitOMPPrivateClause(S, SingleScope); (void)SingleScope.Privatize(); CGF.EmitStmt(Stmt); }; CGM.getOpenMPRuntime().emitSingleRegion(*this, CodeGen, S.getLocStart(), llvm::None, llvm::None, llvm::None, llvm::None); // Emit barrier for firstprivates, lastprivates or reductions only if // 'sections' directive has 'nowait' clause. Otherwise the barrier will be // generated by the codegen for the directive. if ((HasFirstprivates || HasLastprivates || HasReductions) && S.getSingleClause()) { // Emit implicit barrier to synchronize threads and avoid data races on // initialization of firstprivate variables. CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getLocStart(), OMPD_unknown, /*EmitChecks=*/false, /*ForceSimpleCall=*/true); } return OMPD_single; } void CodeGenFunction::EmitOMPSectionsDirective(const OMPSectionsDirective &S) { LexicalScope Scope(*this, S.getSourceRange()); OpenMPDirectiveKind EmittedAs = EmitSections(S); // Emit an implicit barrier at the end. if (!S.getSingleClause()) { CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getLocStart(), EmittedAs); } } void CodeGenFunction::EmitOMPSectionDirective(const OMPSectionDirective &S) { LexicalScope Scope(*this, S.getSourceRange()); auto &&CodeGen = [&S](CodeGenFunction &CGF) { CGF.EmitStmt(cast(S.getAssociatedStmt())->getCapturedStmt()); }; CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_section, CodeGen, S.hasCancel()); } void CodeGenFunction::EmitOMPSingleDirective(const OMPSingleDirective &S) { llvm::SmallVector CopyprivateVars; llvm::SmallVector DestExprs; llvm::SmallVector SrcExprs; llvm::SmallVector AssignmentOps; // Check if there are any 'copyprivate' clauses associated with this // 'single' // construct. // Build a list of copyprivate variables along with helper expressions // (, , = expressions) for (const auto *C : S.getClausesOfKind()) { CopyprivateVars.append(C->varlists().begin(), C->varlists().end()); DestExprs.append(C->destination_exprs().begin(), C->destination_exprs().end()); SrcExprs.append(C->source_exprs().begin(), C->source_exprs().end()); AssignmentOps.append(C->assignment_ops().begin(), C->assignment_ops().end()); } LexicalScope Scope(*this, S.getSourceRange()); // Emit code for 'single' region along with 'copyprivate' clauses bool HasFirstprivates; auto &&CodeGen = [&S, &HasFirstprivates](CodeGenFunction &CGF) { CodeGenFunction::OMPPrivateScope SingleScope(CGF); HasFirstprivates = CGF.EmitOMPFirstprivateClause(S, SingleScope); CGF.EmitOMPPrivateClause(S, SingleScope); (void)SingleScope.Privatize(); CGF.EmitStmt(cast(S.getAssociatedStmt())->getCapturedStmt()); }; CGM.getOpenMPRuntime().emitSingleRegion(*this, CodeGen, S.getLocStart(), CopyprivateVars, DestExprs, SrcExprs, AssignmentOps); // Emit an implicit barrier at the end (to avoid data race on firstprivate // init or if no 'nowait' clause was specified and no 'copyprivate' clause). if ((!S.getSingleClause() || HasFirstprivates) && CopyprivateVars.empty()) { CGM.getOpenMPRuntime().emitBarrierCall( *this, S.getLocStart(), S.getSingleClause() ? OMPD_unknown : OMPD_single); } } void CodeGenFunction::EmitOMPMasterDirective(const OMPMasterDirective &S) { LexicalScope Scope(*this, S.getSourceRange()); auto &&CodeGen = [&S](CodeGenFunction &CGF) { CGF.EmitStmt(cast(S.getAssociatedStmt())->getCapturedStmt()); }; CGM.getOpenMPRuntime().emitMasterRegion(*this, CodeGen, S.getLocStart()); } void CodeGenFunction::EmitOMPCriticalDirective(const OMPCriticalDirective &S) { LexicalScope Scope(*this, S.getSourceRange()); auto &&CodeGen = [&S](CodeGenFunction &CGF) { CGF.EmitStmt(cast(S.getAssociatedStmt())->getCapturedStmt()); }; Expr *Hint = nullptr; if (auto *HintClause = S.getSingleClause()) Hint = HintClause->getHint(); CGM.getOpenMPRuntime().emitCriticalRegion(*this, S.getDirectiveName().getAsString(), CodeGen, S.getLocStart(), Hint); } void CodeGenFunction::EmitOMPParallelForDirective( const OMPParallelForDirective &S) { // Emit directive as a combined directive that consists of two implicit // directives: 'parallel' with 'for' directive. LexicalScope Scope(*this, S.getSourceRange()); (void)emitScheduleClause(*this, S, /*OuterRegion=*/true); auto &&CodeGen = [&S](CodeGenFunction &CGF) { CGF.EmitOMPWorksharingLoop(S); }; emitCommonOMPParallelDirective(*this, S, OMPD_for, CodeGen); } void CodeGenFunction::EmitOMPParallelForSimdDirective( const OMPParallelForSimdDirective &S) { // Emit directive as a combined directive that consists of two implicit // directives: 'parallel' with 'for' directive. LexicalScope Scope(*this, S.getSourceRange()); (void)emitScheduleClause(*this, S, /*OuterRegion=*/true); auto &&CodeGen = [&S](CodeGenFunction &CGF) { CGF.EmitOMPWorksharingLoop(S); }; emitCommonOMPParallelDirective(*this, S, OMPD_simd, CodeGen); } void CodeGenFunction::EmitOMPParallelSectionsDirective( const OMPParallelSectionsDirective &S) { // Emit directive as a combined directive that consists of two implicit // directives: 'parallel' with 'sections' directive. LexicalScope Scope(*this, S.getSourceRange()); auto &&CodeGen = [&S](CodeGenFunction &CGF) { (void)CGF.EmitSections(S); }; emitCommonOMPParallelDirective(*this, S, OMPD_sections, CodeGen); } void CodeGenFunction::EmitOMPTaskDirective(const OMPTaskDirective &S) { // Emit outlined function for task construct. LexicalScope Scope(*this, S.getSourceRange()); auto CS = cast(S.getAssociatedStmt()); auto CapturedStruct = GenerateCapturedStmtArgument(*CS); auto *I = CS->getCapturedDecl()->param_begin(); auto *PartId = std::next(I); // The first function argument for tasks is a thread id, the second one is a // part id (0 for tied tasks, >=0 for untied task). llvm::DenseSet EmittedAsPrivate; // Get list of private variables. llvm::SmallVector PrivateVars; llvm::SmallVector PrivateCopies; for (const auto *C : S.getClausesOfKind()) { auto IRef = C->varlist_begin(); for (auto *IInit : C->private_copies()) { auto *OrigVD = cast(cast(*IRef)->getDecl()); if (EmittedAsPrivate.insert(OrigVD->getCanonicalDecl()).second) { PrivateVars.push_back(*IRef); PrivateCopies.push_back(IInit); } ++IRef; } } EmittedAsPrivate.clear(); // Get list of firstprivate variables. llvm::SmallVector FirstprivateVars; llvm::SmallVector FirstprivateCopies; llvm::SmallVector FirstprivateInits; for (const auto *C : S.getClausesOfKind()) { auto IRef = C->varlist_begin(); auto IElemInitRef = C->inits().begin(); for (auto *IInit : C->private_copies()) { auto *OrigVD = cast(cast(*IRef)->getDecl()); if (EmittedAsPrivate.insert(OrigVD->getCanonicalDecl()).second) { FirstprivateVars.push_back(*IRef); FirstprivateCopies.push_back(IInit); FirstprivateInits.push_back(*IElemInitRef); } ++IRef, ++IElemInitRef; } } // Build list of dependences. llvm::SmallVector, 8> Dependences; for (const auto *C : S.getClausesOfKind()) { for (auto *IRef : C->varlists()) { Dependences.push_back(std::make_pair(C->getDependencyKind(), IRef)); } } auto &&CodeGen = [PartId, &S, &PrivateVars, &FirstprivateVars]( CodeGenFunction &CGF) { // Set proper addresses for generated private copies. auto *CS = cast(S.getAssociatedStmt()); OMPPrivateScope Scope(CGF); if (!PrivateVars.empty() || !FirstprivateVars.empty()) { auto *CopyFn = CGF.Builder.CreateLoad( CGF.GetAddrOfLocalVar(CS->getCapturedDecl()->getParam(3))); auto *PrivatesPtr = CGF.Builder.CreateLoad( CGF.GetAddrOfLocalVar(CS->getCapturedDecl()->getParam(2))); // Map privates. llvm::SmallVector, 16> PrivatePtrs; llvm::SmallVector CallArgs; CallArgs.push_back(PrivatesPtr); for (auto *E : PrivateVars) { auto *VD = cast(cast(E)->getDecl()); Address PrivatePtr = CGF.CreateMemTemp(CGF.getContext().getPointerType(E->getType())); PrivatePtrs.push_back(std::make_pair(VD, PrivatePtr)); CallArgs.push_back(PrivatePtr.getPointer()); } for (auto *E : FirstprivateVars) { auto *VD = cast(cast(E)->getDecl()); Address PrivatePtr = CGF.CreateMemTemp(CGF.getContext().getPointerType(E->getType())); PrivatePtrs.push_back(std::make_pair(VD, PrivatePtr)); CallArgs.push_back(PrivatePtr.getPointer()); } CGF.EmitRuntimeCall(CopyFn, CallArgs); for (auto &&Pair : PrivatePtrs) { Address Replacement(CGF.Builder.CreateLoad(Pair.second), CGF.getContext().getDeclAlign(Pair.first)); Scope.addPrivate(Pair.first, [Replacement]() { return Replacement; }); } } (void)Scope.Privatize(); if (*PartId) { // TODO: emit code for untied tasks. } CGF.EmitStmt(CS->getCapturedStmt()); }; auto OutlinedFn = CGM.getOpenMPRuntime().emitTaskOutlinedFunction( S, *I, OMPD_task, CodeGen); // Check if we should emit tied or untied task. bool Tied = !S.getSingleClause(); // Check if the task is final llvm::PointerIntPair Final; if (const auto *Clause = S.getSingleClause()) { // If the condition constant folds and can be elided, try to avoid emitting // the condition and the dead arm of the if/else. auto *Cond = Clause->getCondition(); bool CondConstant; if (ConstantFoldsToSimpleInteger(Cond, CondConstant)) Final.setInt(CondConstant); else Final.setPointer(EvaluateExprAsBool(Cond)); } else { // By default the task is not final. Final.setInt(/*IntVal=*/false); } auto SharedsTy = getContext().getRecordType(CS->getCapturedRecordDecl()); const Expr *IfCond = nullptr; for (const auto *C : S.getClausesOfKind()) { if (C->getNameModifier() == OMPD_unknown || C->getNameModifier() == OMPD_task) { IfCond = C->getCondition(); break; } } CGM.getOpenMPRuntime().emitTaskCall( *this, S.getLocStart(), S, Tied, Final, OutlinedFn, SharedsTy, CapturedStruct, IfCond, PrivateVars, PrivateCopies, FirstprivateVars, FirstprivateCopies, FirstprivateInits, Dependences); } void CodeGenFunction::EmitOMPTaskyieldDirective( const OMPTaskyieldDirective &S) { CGM.getOpenMPRuntime().emitTaskyieldCall(*this, S.getLocStart()); } void CodeGenFunction::EmitOMPBarrierDirective(const OMPBarrierDirective &S) { CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getLocStart(), OMPD_barrier); } void CodeGenFunction::EmitOMPTaskwaitDirective(const OMPTaskwaitDirective &S) { CGM.getOpenMPRuntime().emitTaskwaitCall(*this, S.getLocStart()); } void CodeGenFunction::EmitOMPTaskgroupDirective( const OMPTaskgroupDirective &S) { LexicalScope Scope(*this, S.getSourceRange()); auto &&CodeGen = [&S](CodeGenFunction &CGF) { CGF.EmitStmt(cast(S.getAssociatedStmt())->getCapturedStmt()); }; CGM.getOpenMPRuntime().emitTaskgroupRegion(*this, CodeGen, S.getLocStart()); } void CodeGenFunction::EmitOMPFlushDirective(const OMPFlushDirective &S) { CGM.getOpenMPRuntime().emitFlush(*this, [&]() -> ArrayRef { if (const auto *FlushClause = S.getSingleClause()) { return llvm::makeArrayRef(FlushClause->varlist_begin(), FlushClause->varlist_end()); } return llvm::None; }(), S.getLocStart()); } void CodeGenFunction::EmitOMPDistributeDirective( const OMPDistributeDirective &S) { llvm_unreachable("CodeGen for 'omp distribute' is not supported yet."); } static llvm::Function *emitOutlinedOrderedFunction(CodeGenModule &CGM, const CapturedStmt *S) { CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); CodeGenFunction::CGCapturedStmtInfo CapStmtInfo; CGF.CapturedStmtInfo = &CapStmtInfo; auto *Fn = CGF.GenerateOpenMPCapturedStmtFunction(*S); Fn->addFnAttr(llvm::Attribute::NoInline); return Fn; } void CodeGenFunction::EmitOMPOrderedDirective(const OMPOrderedDirective &S) { if (!S.getAssociatedStmt()) return; LexicalScope Scope(*this, S.getSourceRange()); auto *C = S.getSingleClause(); auto &&CodeGen = [&S, C, this](CodeGenFunction &CGF) { if (C) { auto CS = cast(S.getAssociatedStmt()); llvm::SmallVector CapturedVars; CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars); auto *OutlinedFn = emitOutlinedOrderedFunction(CGM, CS); CGF.EmitNounwindRuntimeCall(OutlinedFn, CapturedVars); } else { CGF.EmitStmt( cast(S.getAssociatedStmt())->getCapturedStmt()); } }; CGM.getOpenMPRuntime().emitOrderedRegion(*this, CodeGen, S.getLocStart(), !C); } static llvm::Value *convertToScalarValue(CodeGenFunction &CGF, RValue Val, QualType SrcType, QualType DestType, SourceLocation Loc) { assert(CGF.hasScalarEvaluationKind(DestType) && "DestType must have scalar evaluation kind."); assert(!Val.isAggregate() && "Must be a scalar or complex."); return Val.isScalar() ? CGF.EmitScalarConversion(Val.getScalarVal(), SrcType, DestType, Loc) : CGF.EmitComplexToScalarConversion(Val.getComplexVal(), SrcType, DestType, Loc); } static CodeGenFunction::ComplexPairTy convertToComplexValue(CodeGenFunction &CGF, RValue Val, QualType SrcType, QualType DestType, SourceLocation Loc) { assert(CGF.getEvaluationKind(DestType) == TEK_Complex && "DestType must have complex evaluation kind."); CodeGenFunction::ComplexPairTy ComplexVal; if (Val.isScalar()) { // Convert the input element to the element type of the complex. auto DestElementType = DestType->castAs()->getElementType(); auto ScalarVal = CGF.EmitScalarConversion(Val.getScalarVal(), SrcType, DestElementType, Loc); ComplexVal = CodeGenFunction::ComplexPairTy( ScalarVal, llvm::Constant::getNullValue(ScalarVal->getType())); } else { assert(Val.isComplex() && "Must be a scalar or complex."); auto SrcElementType = SrcType->castAs()->getElementType(); auto DestElementType = DestType->castAs()->getElementType(); ComplexVal.first = CGF.EmitScalarConversion( Val.getComplexVal().first, SrcElementType, DestElementType, Loc); ComplexVal.second = CGF.EmitScalarConversion( Val.getComplexVal().second, SrcElementType, DestElementType, Loc); } return ComplexVal; } static void emitSimpleAtomicStore(CodeGenFunction &CGF, bool IsSeqCst, LValue LVal, RValue RVal) { if (LVal.isGlobalReg()) { CGF.EmitStoreThroughGlobalRegLValue(RVal, LVal); } else { CGF.EmitAtomicStore(RVal, LVal, IsSeqCst ? llvm::SequentiallyConsistent : llvm::Monotonic, LVal.isVolatile(), /*IsInit=*/false); } } -static void emitSimpleStore(CodeGenFunction &CGF, LValue LVal, RValue RVal, - QualType RValTy, SourceLocation Loc) { - switch (CGF.getEvaluationKind(LVal.getType())) { +void CodeGenFunction::emitOMPSimpleStore(LValue LVal, RValue RVal, + QualType RValTy, SourceLocation Loc) { + switch (getEvaluationKind(LVal.getType())) { case TEK_Scalar: - CGF.EmitStoreThroughLValue(RValue::get(convertToScalarValue( - CGF, RVal, RValTy, LVal.getType(), Loc)), - LVal); + EmitStoreThroughLValue(RValue::get(convertToScalarValue( + *this, RVal, RValTy, LVal.getType(), Loc)), + LVal); break; case TEK_Complex: - CGF.EmitStoreOfComplex( - convertToComplexValue(CGF, RVal, RValTy, LVal.getType(), Loc), LVal, + EmitStoreOfComplex( + convertToComplexValue(*this, RVal, RValTy, LVal.getType(), Loc), LVal, /*isInit=*/false); break; case TEK_Aggregate: llvm_unreachable("Must be a scalar or complex."); } } static void EmitOMPAtomicReadExpr(CodeGenFunction &CGF, bool IsSeqCst, const Expr *X, const Expr *V, SourceLocation Loc) { // v = x; assert(V->isLValue() && "V of 'omp atomic read' is not lvalue"); assert(X->isLValue() && "X of 'omp atomic read' is not lvalue"); LValue XLValue = CGF.EmitLValue(X); LValue VLValue = CGF.EmitLValue(V); RValue Res = XLValue.isGlobalReg() ? CGF.EmitLoadOfLValue(XLValue, Loc) : CGF.EmitAtomicLoad(XLValue, Loc, IsSeqCst ? llvm::SequentiallyConsistent : llvm::Monotonic, XLValue.isVolatile()); // OpenMP, 2.12.6, atomic Construct // Any atomic construct with a seq_cst clause forces the atomically // performed operation to include an implicit flush operation without a // list. if (IsSeqCst) CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc); - emitSimpleStore(CGF, VLValue, Res, X->getType().getNonReferenceType(), Loc); + CGF.emitOMPSimpleStore(VLValue, Res, X->getType().getNonReferenceType(), Loc); } static void EmitOMPAtomicWriteExpr(CodeGenFunction &CGF, bool IsSeqCst, const Expr *X, const Expr *E, SourceLocation Loc) { // x = expr; assert(X->isLValue() && "X of 'omp atomic write' is not lvalue"); emitSimpleAtomicStore(CGF, IsSeqCst, CGF.EmitLValue(X), CGF.EmitAnyExpr(E)); // OpenMP, 2.12.6, atomic Construct // Any atomic construct with a seq_cst clause forces the atomically // performed operation to include an implicit flush operation without a // list. if (IsSeqCst) CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc); } static std::pair emitOMPAtomicRMW(CodeGenFunction &CGF, LValue X, RValue Update, BinaryOperatorKind BO, llvm::AtomicOrdering AO, bool IsXLHSInRHSPart) { auto &Context = CGF.CGM.getContext(); // Allow atomicrmw only if 'x' and 'update' are integer values, lvalue for 'x' // expression is simple and atomic is allowed for the given type for the // target platform. if (BO == BO_Comma || !Update.isScalar() || !Update.getScalarVal()->getType()->isIntegerTy() || !X.isSimple() || (!isa(Update.getScalarVal()) && (Update.getScalarVal()->getType() != X.getAddress().getElementType())) || !X.getAddress().getElementType()->isIntegerTy() || !Context.getTargetInfo().hasBuiltinAtomic( Context.getTypeSize(X.getType()), Context.toBits(X.getAlignment()))) return std::make_pair(false, RValue::get(nullptr)); llvm::AtomicRMWInst::BinOp RMWOp; switch (BO) { case BO_Add: RMWOp = llvm::AtomicRMWInst::Add; break; case BO_Sub: if (!IsXLHSInRHSPart) return std::make_pair(false, RValue::get(nullptr)); RMWOp = llvm::AtomicRMWInst::Sub; break; case BO_And: RMWOp = llvm::AtomicRMWInst::And; break; case BO_Or: RMWOp = llvm::AtomicRMWInst::Or; break; case BO_Xor: RMWOp = llvm::AtomicRMWInst::Xor; break; case BO_LT: RMWOp = X.getType()->hasSignedIntegerRepresentation() ? (IsXLHSInRHSPart ? llvm::AtomicRMWInst::Min : llvm::AtomicRMWInst::Max) : (IsXLHSInRHSPart ? llvm::AtomicRMWInst::UMin : llvm::AtomicRMWInst::UMax); break; case BO_GT: RMWOp = X.getType()->hasSignedIntegerRepresentation() ? (IsXLHSInRHSPart ? llvm::AtomicRMWInst::Max : llvm::AtomicRMWInst::Min) : (IsXLHSInRHSPart ? llvm::AtomicRMWInst::UMax : llvm::AtomicRMWInst::UMin); break; case BO_Assign: RMWOp = llvm::AtomicRMWInst::Xchg; break; case BO_Mul: case BO_Div: case BO_Rem: case BO_Shl: case BO_Shr: case BO_LAnd: case BO_LOr: return std::make_pair(false, RValue::get(nullptr)); case BO_PtrMemD: case BO_PtrMemI: case BO_LE: case BO_GE: case BO_EQ: case BO_NE: case BO_AddAssign: case BO_SubAssign: case BO_AndAssign: case BO_OrAssign: case BO_XorAssign: case BO_MulAssign: case BO_DivAssign: case BO_RemAssign: case BO_ShlAssign: case BO_ShrAssign: case BO_Comma: llvm_unreachable("Unsupported atomic update operation"); } auto *UpdateVal = Update.getScalarVal(); if (auto *IC = dyn_cast(UpdateVal)) { UpdateVal = CGF.Builder.CreateIntCast( IC, X.getAddress().getElementType(), X.getType()->hasSignedIntegerRepresentation()); } auto *Res = CGF.Builder.CreateAtomicRMW(RMWOp, X.getPointer(), UpdateVal, AO); return std::make_pair(true, RValue::get(Res)); } std::pair CodeGenFunction::EmitOMPAtomicSimpleUpdateExpr( LValue X, RValue E, BinaryOperatorKind BO, bool IsXLHSInRHSPart, llvm::AtomicOrdering AO, SourceLocation Loc, const llvm::function_ref &CommonGen) { // Update expressions are allowed to have the following forms: // x binop= expr; -> xrval + expr; // x++, ++x -> xrval + 1; // x--, --x -> xrval - 1; // x = x binop expr; -> xrval binop expr // x = expr Op x; - > expr binop xrval; auto Res = emitOMPAtomicRMW(*this, X, E, BO, AO, IsXLHSInRHSPart); if (!Res.first) { if (X.isGlobalReg()) { // Emit an update expression: 'xrval' binop 'expr' or 'expr' binop // 'xrval'. EmitStoreThroughLValue(CommonGen(EmitLoadOfLValue(X, Loc)), X); } else { // Perform compare-and-swap procedure. EmitAtomicUpdate(X, AO, CommonGen, X.getType().isVolatileQualified()); } } return Res; } static void EmitOMPAtomicUpdateExpr(CodeGenFunction &CGF, bool IsSeqCst, const Expr *X, const Expr *E, const Expr *UE, bool IsXLHSInRHSPart, SourceLocation Loc) { assert(isa(UE->IgnoreImpCasts()) && "Update expr in 'atomic update' must be a binary operator."); auto *BOUE = cast(UE->IgnoreImpCasts()); // Update expressions are allowed to have the following forms: // x binop= expr; -> xrval + expr; // x++, ++x -> xrval + 1; // x--, --x -> xrval - 1; // x = x binop expr; -> xrval binop expr // x = expr Op x; - > expr binop xrval; assert(X->isLValue() && "X of 'omp atomic update' is not lvalue"); LValue XLValue = CGF.EmitLValue(X); RValue ExprRValue = CGF.EmitAnyExpr(E); auto AO = IsSeqCst ? llvm::SequentiallyConsistent : llvm::Monotonic; auto *LHS = cast(BOUE->getLHS()->IgnoreImpCasts()); auto *RHS = cast(BOUE->getRHS()->IgnoreImpCasts()); auto *XRValExpr = IsXLHSInRHSPart ? LHS : RHS; auto *ERValExpr = IsXLHSInRHSPart ? RHS : LHS; auto Gen = [&CGF, UE, ExprRValue, XRValExpr, ERValExpr](RValue XRValue) -> RValue { CodeGenFunction::OpaqueValueMapping MapExpr(CGF, ERValExpr, ExprRValue); CodeGenFunction::OpaqueValueMapping MapX(CGF, XRValExpr, XRValue); return CGF.EmitAnyExpr(UE); }; (void)CGF.EmitOMPAtomicSimpleUpdateExpr( XLValue, ExprRValue, BOUE->getOpcode(), IsXLHSInRHSPart, AO, Loc, Gen); // OpenMP, 2.12.6, atomic Construct // Any atomic construct with a seq_cst clause forces the atomically // performed operation to include an implicit flush operation without a // list. if (IsSeqCst) CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc); } static RValue convertToType(CodeGenFunction &CGF, RValue Value, QualType SourceType, QualType ResType, SourceLocation Loc) { switch (CGF.getEvaluationKind(ResType)) { case TEK_Scalar: return RValue::get( convertToScalarValue(CGF, Value, SourceType, ResType, Loc)); case TEK_Complex: { auto Res = convertToComplexValue(CGF, Value, SourceType, ResType, Loc); return RValue::getComplex(Res.first, Res.second); } case TEK_Aggregate: break; } llvm_unreachable("Must be a scalar or complex."); } static void EmitOMPAtomicCaptureExpr(CodeGenFunction &CGF, bool IsSeqCst, bool IsPostfixUpdate, const Expr *V, const Expr *X, const Expr *E, const Expr *UE, bool IsXLHSInRHSPart, SourceLocation Loc) { assert(X->isLValue() && "X of 'omp atomic capture' is not lvalue"); assert(V->isLValue() && "V of 'omp atomic capture' is not lvalue"); RValue NewVVal; LValue VLValue = CGF.EmitLValue(V); LValue XLValue = CGF.EmitLValue(X); RValue ExprRValue = CGF.EmitAnyExpr(E); auto AO = IsSeqCst ? llvm::SequentiallyConsistent : llvm::Monotonic; QualType NewVValType; if (UE) { // 'x' is updated with some additional value. assert(isa(UE->IgnoreImpCasts()) && "Update expr in 'atomic capture' must be a binary operator."); auto *BOUE = cast(UE->IgnoreImpCasts()); // Update expressions are allowed to have the following forms: // x binop= expr; -> xrval + expr; // x++, ++x -> xrval + 1; // x--, --x -> xrval - 1; // x = x binop expr; -> xrval binop expr // x = expr Op x; - > expr binop xrval; auto *LHS = cast(BOUE->getLHS()->IgnoreImpCasts()); auto *RHS = cast(BOUE->getRHS()->IgnoreImpCasts()); auto *XRValExpr = IsXLHSInRHSPart ? LHS : RHS; NewVValType = XRValExpr->getType(); auto *ERValExpr = IsXLHSInRHSPart ? RHS : LHS; auto &&Gen = [&CGF, &NewVVal, UE, ExprRValue, XRValExpr, ERValExpr, IsSeqCst, IsPostfixUpdate](RValue XRValue) -> RValue { CodeGenFunction::OpaqueValueMapping MapExpr(CGF, ERValExpr, ExprRValue); CodeGenFunction::OpaqueValueMapping MapX(CGF, XRValExpr, XRValue); RValue Res = CGF.EmitAnyExpr(UE); NewVVal = IsPostfixUpdate ? XRValue : Res; return Res; }; auto Res = CGF.EmitOMPAtomicSimpleUpdateExpr( XLValue, ExprRValue, BOUE->getOpcode(), IsXLHSInRHSPart, AO, Loc, Gen); if (Res.first) { // 'atomicrmw' instruction was generated. if (IsPostfixUpdate) { // Use old value from 'atomicrmw'. NewVVal = Res.second; } else { // 'atomicrmw' does not provide new value, so evaluate it using old // value of 'x'. CodeGenFunction::OpaqueValueMapping MapExpr(CGF, ERValExpr, ExprRValue); CodeGenFunction::OpaqueValueMapping MapX(CGF, XRValExpr, Res.second); NewVVal = CGF.EmitAnyExpr(UE); } } } else { // 'x' is simply rewritten with some 'expr'. NewVValType = X->getType().getNonReferenceType(); ExprRValue = convertToType(CGF, ExprRValue, E->getType(), X->getType().getNonReferenceType(), Loc); auto &&Gen = [&CGF, &NewVVal, ExprRValue](RValue XRValue) -> RValue { NewVVal = XRValue; return ExprRValue; }; // Try to perform atomicrmw xchg, otherwise simple exchange. auto Res = CGF.EmitOMPAtomicSimpleUpdateExpr( XLValue, ExprRValue, /*BO=*/BO_Assign, /*IsXLHSInRHSPart=*/false, AO, Loc, Gen); if (Res.first) { // 'atomicrmw' instruction was generated. NewVVal = IsPostfixUpdate ? Res.second : ExprRValue; } } // Emit post-update store to 'v' of old/new 'x' value. - emitSimpleStore(CGF, VLValue, NewVVal, NewVValType, Loc); + CGF.emitOMPSimpleStore(VLValue, NewVVal, NewVValType, Loc); // OpenMP, 2.12.6, atomic Construct // Any atomic construct with a seq_cst clause forces the atomically // performed operation to include an implicit flush operation without a // list. if (IsSeqCst) CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc); } static void EmitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind, bool IsSeqCst, bool IsPostfixUpdate, const Expr *X, const Expr *V, const Expr *E, const Expr *UE, bool IsXLHSInRHSPart, SourceLocation Loc) { switch (Kind) { case OMPC_read: EmitOMPAtomicReadExpr(CGF, IsSeqCst, X, V, Loc); break; case OMPC_write: EmitOMPAtomicWriteExpr(CGF, IsSeqCst, X, E, Loc); break; case OMPC_unknown: case OMPC_update: EmitOMPAtomicUpdateExpr(CGF, IsSeqCst, X, E, UE, IsXLHSInRHSPart, Loc); break; case OMPC_capture: EmitOMPAtomicCaptureExpr(CGF, IsSeqCst, IsPostfixUpdate, V, X, E, UE, IsXLHSInRHSPart, Loc); break; case OMPC_if: case OMPC_final: case OMPC_num_threads: case OMPC_private: case OMPC_firstprivate: case OMPC_lastprivate: case OMPC_reduction: case OMPC_safelen: case OMPC_simdlen: case OMPC_collapse: case OMPC_default: case OMPC_seq_cst: case OMPC_shared: case OMPC_linear: case OMPC_aligned: case OMPC_copyin: case OMPC_copyprivate: case OMPC_flush: case OMPC_proc_bind: case OMPC_schedule: case OMPC_ordered: case OMPC_nowait: case OMPC_untied: case OMPC_threadprivate: case OMPC_depend: case OMPC_mergeable: case OMPC_device: case OMPC_threads: case OMPC_simd: case OMPC_map: case OMPC_num_teams: case OMPC_thread_limit: case OMPC_priority: case OMPC_grainsize: case OMPC_nogroup: case OMPC_num_tasks: case OMPC_hint: llvm_unreachable("Clause is not allowed in 'omp atomic'."); } } void CodeGenFunction::EmitOMPAtomicDirective(const OMPAtomicDirective &S) { bool IsSeqCst = S.getSingleClause(); OpenMPClauseKind Kind = OMPC_unknown; for (auto *C : S.clauses()) { // Find first clause (skip seq_cst clause, if it is first). if (C->getClauseKind() != OMPC_seq_cst) { Kind = C->getClauseKind(); break; } } const auto *CS = S.getAssociatedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true); if (const auto *EWC = dyn_cast(CS)) { enterFullExpression(EWC); } // Processing for statements under 'atomic capture'. if (const auto *Compound = dyn_cast(CS)) { for (const auto *C : Compound->body()) { if (const auto *EWC = dyn_cast(C)) { enterFullExpression(EWC); } } } LexicalScope Scope(*this, S.getSourceRange()); auto &&CodeGen = [&S, Kind, IsSeqCst, CS](CodeGenFunction &CGF) { CGF.EmitStopPoint(CS); EmitOMPAtomicExpr(CGF, Kind, IsSeqCst, S.isPostfixUpdate(), S.getX(), S.getV(), S.getExpr(), S.getUpdateExpr(), S.isXLHSInRHSPart(), S.getLocStart()); }; CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_atomic, CodeGen); } void CodeGenFunction::EmitOMPTargetDirective(const OMPTargetDirective &S) { LexicalScope Scope(*this, S.getSourceRange()); const CapturedStmt &CS = *cast(S.getAssociatedStmt()); llvm::SmallVector CapturedVars; GenerateOpenMPCapturedVars(CS, CapturedVars); llvm::Function *Fn = nullptr; llvm::Constant *FnID = nullptr; // Check if we have any if clause associated with the directive. const Expr *IfCond = nullptr; if (auto *C = S.getSingleClause()) { IfCond = C->getCondition(); } // Check if we have any device clause associated with the directive. const Expr *Device = nullptr; if (auto *C = S.getSingleClause()) { Device = C->getDevice(); } // Check if we have an if clause whose conditional always evaluates to false // or if we do not have any targets specified. If so the target region is not // an offload entry point. bool IsOffloadEntry = true; if (IfCond) { bool Val; if (ConstantFoldsToSimpleInteger(IfCond, Val) && !Val) IsOffloadEntry = false; } if (CGM.getLangOpts().OMPTargetTriples.empty()) IsOffloadEntry = false; assert(CurFuncDecl && "No parent declaration for target region!"); StringRef ParentName; // In case we have Ctors/Dtors we use the complete type variant to produce // the mangling of the device outlined kernel. if (auto *D = dyn_cast(CurFuncDecl)) ParentName = CGM.getMangledName(GlobalDecl(D, Ctor_Complete)); else if (auto *D = dyn_cast(CurFuncDecl)) ParentName = CGM.getMangledName(GlobalDecl(D, Dtor_Complete)); else ParentName = CGM.getMangledName(GlobalDecl(cast(CurFuncDecl))); CGM.getOpenMPRuntime().emitTargetOutlinedFunction(S, ParentName, Fn, FnID, IsOffloadEntry); CGM.getOpenMPRuntime().emitTargetCall(*this, S, Fn, FnID, IfCond, Device, CapturedVars); } void CodeGenFunction::EmitOMPTeamsDirective(const OMPTeamsDirective &) { llvm_unreachable("CodeGen for 'omp teams' is not supported yet."); } void CodeGenFunction::EmitOMPCancellationPointDirective( const OMPCancellationPointDirective &S) { CGM.getOpenMPRuntime().emitCancellationPointCall(*this, S.getLocStart(), S.getCancelRegion()); } void CodeGenFunction::EmitOMPCancelDirective(const OMPCancelDirective &S) { const Expr *IfCond = nullptr; for (const auto *C : S.getClausesOfKind()) { if (C->getNameModifier() == OMPD_unknown || C->getNameModifier() == OMPD_cancel) { IfCond = C->getCondition(); break; } } CGM.getOpenMPRuntime().emitCancelCall(*this, S.getLocStart(), IfCond, S.getCancelRegion()); } CodeGenFunction::JumpDest CodeGenFunction::getOMPCancelDestination(OpenMPDirectiveKind Kind) { if (Kind == OMPD_parallel || Kind == OMPD_task) return ReturnBlock; assert(Kind == OMPD_for || Kind == OMPD_section || Kind == OMPD_sections || Kind == OMPD_parallel_sections || Kind == OMPD_parallel_for); return BreakContinueStack.back().BreakBlock; } // Generate the instructions for '#pragma omp target data' directive. void CodeGenFunction::EmitOMPTargetDataDirective( const OMPTargetDataDirective &S) { // emit the code inside the construct for now auto CS = cast(S.getAssociatedStmt()); CGM.getOpenMPRuntime().emitInlinedDirective( *this, OMPD_target_data, [&CS](CodeGenFunction &CGF) { CGF.EmitStmt(CS->getCapturedStmt()); }); } void CodeGenFunction::EmitOMPTaskLoopDirective(const OMPTaskLoopDirective &S) { // emit the code inside the construct for now auto CS = cast(S.getAssociatedStmt()); CGM.getOpenMPRuntime().emitInlinedDirective( *this, OMPD_taskloop, [&CS](CodeGenFunction &CGF) { CGF.EmitStmt(CS->getCapturedStmt()); }); } void CodeGenFunction::EmitOMPTaskLoopSimdDirective( const OMPTaskLoopSimdDirective &S) { // emit the code inside the construct for now auto CS = cast(S.getAssociatedStmt()); CGM.getOpenMPRuntime().emitInlinedDirective( *this, OMPD_taskloop_simd, [&CS](CodeGenFunction &CGF) { CGF.EmitStmt(CS->getCapturedStmt()); }); } Index: projects/clang380-import/contrib/llvm/tools/clang/lib/CodeGen/CodeGenFunction.h =================================================================== --- projects/clang380-import/contrib/llvm/tools/clang/lib/CodeGen/CodeGenFunction.h (revision 294608) +++ projects/clang380-import/contrib/llvm/tools/clang/lib/CodeGen/CodeGenFunction.h (revision 294609) @@ -1,3312 +1,3314 @@ //===-- CodeGenFunction.h - Per-Function state for LLVM CodeGen -*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This is the internal per-function state used for llvm translation. // //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_LIB_CODEGEN_CODEGENFUNCTION_H #define LLVM_CLANG_LIB_CODEGEN_CODEGENFUNCTION_H #include "CGBuilder.h" #include "CGDebugInfo.h" #include "CGLoopInfo.h" #include "CGValue.h" #include "CodeGenModule.h" #include "CodeGenPGO.h" #include "EHScopeStack.h" #include "clang/AST/CharUnits.h" #include "clang/AST/ExprCXX.h" #include "clang/AST/ExprObjC.h" #include "clang/AST/ExprOpenMP.h" #include "clang/AST/Type.h" #include "clang/Basic/ABI.h" #include "clang/Basic/CapturedStmt.h" #include "clang/Basic/OpenMPKinds.h" #include "clang/Basic/TargetInfo.h" #include "clang/Frontend/CodeGenOptions.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Debug.h" namespace llvm { class BasicBlock; class LLVMContext; class MDNode; class Module; class SwitchInst; class Twine; class Value; class CallSite; } namespace clang { class ASTContext; class BlockDecl; class CXXDestructorDecl; class CXXForRangeStmt; class CXXTryStmt; class Decl; class LabelDecl; class EnumConstantDecl; class FunctionDecl; class FunctionProtoType; class LabelStmt; class ObjCContainerDecl; class ObjCInterfaceDecl; class ObjCIvarDecl; class ObjCMethodDecl; class ObjCImplementationDecl; class ObjCPropertyImplDecl; class TargetInfo; class TargetCodeGenInfo; class VarDecl; class ObjCForCollectionStmt; class ObjCAtTryStmt; class ObjCAtThrowStmt; class ObjCAtSynchronizedStmt; class ObjCAutoreleasePoolStmt; namespace CodeGen { class CodeGenTypes; class CGFunctionInfo; class CGRecordLayout; class CGBlockInfo; class CGCXXABI; class BlockByrefHelpers; class BlockByrefInfo; class BlockFlags; class BlockFieldFlags; /// The kind of evaluation to perform on values of a particular /// type. Basically, is the code in CGExprScalar, CGExprComplex, or /// CGExprAgg? /// /// TODO: should vectors maybe be split out into their own thing? enum TypeEvaluationKind { TEK_Scalar, TEK_Complex, TEK_Aggregate }; /// CodeGenFunction - This class organizes the per-function state that is used /// while generating LLVM code. class CodeGenFunction : public CodeGenTypeCache { CodeGenFunction(const CodeGenFunction &) = delete; void operator=(const CodeGenFunction &) = delete; friend class CGCXXABI; public: /// A jump destination is an abstract label, branching to which may /// require a jump out through normal cleanups. struct JumpDest { JumpDest() : Block(nullptr), ScopeDepth(), Index(0) {} JumpDest(llvm::BasicBlock *Block, EHScopeStack::stable_iterator Depth, unsigned Index) : Block(Block), ScopeDepth(Depth), Index(Index) {} bool isValid() const { return Block != nullptr; } llvm::BasicBlock *getBlock() const { return Block; } EHScopeStack::stable_iterator getScopeDepth() const { return ScopeDepth; } unsigned getDestIndex() const { return Index; } // This should be used cautiously. void setScopeDepth(EHScopeStack::stable_iterator depth) { ScopeDepth = depth; } private: llvm::BasicBlock *Block; EHScopeStack::stable_iterator ScopeDepth; unsigned Index; }; CodeGenModule &CGM; // Per-module state. const TargetInfo &Target; typedef std::pair ComplexPairTy; LoopInfoStack LoopStack; CGBuilderTy Builder; /// \brief CGBuilder insert helper. This function is called after an /// instruction is created using Builder. void InsertHelper(llvm::Instruction *I, const llvm::Twine &Name, llvm::BasicBlock *BB, llvm::BasicBlock::iterator InsertPt) const; /// CurFuncDecl - Holds the Decl for the current outermost /// non-closure context. const Decl *CurFuncDecl; /// CurCodeDecl - This is the inner-most code context, which includes blocks. const Decl *CurCodeDecl; const CGFunctionInfo *CurFnInfo; QualType FnRetTy; llvm::Function *CurFn; /// CurGD - The GlobalDecl for the current function being compiled. GlobalDecl CurGD; /// PrologueCleanupDepth - The cleanup depth enclosing all the /// cleanups associated with the parameters. EHScopeStack::stable_iterator PrologueCleanupDepth; /// ReturnBlock - Unified return block. JumpDest ReturnBlock; /// ReturnValue - The temporary alloca to hold the return /// value. This is invalid iff the function has no return value. Address ReturnValue; /// AllocaInsertPoint - This is an instruction in the entry block before which /// we prefer to insert allocas. llvm::AssertingVH AllocaInsertPt; /// \brief API for captured statement code generation. class CGCapturedStmtInfo { public: explicit CGCapturedStmtInfo(CapturedRegionKind K = CR_Default) : Kind(K), ThisValue(nullptr), CXXThisFieldDecl(nullptr) {} explicit CGCapturedStmtInfo(const CapturedStmt &S, CapturedRegionKind K = CR_Default) : Kind(K), ThisValue(nullptr), CXXThisFieldDecl(nullptr) { RecordDecl::field_iterator Field = S.getCapturedRecordDecl()->field_begin(); for (CapturedStmt::const_capture_iterator I = S.capture_begin(), E = S.capture_end(); I != E; ++I, ++Field) { if (I->capturesThis()) CXXThisFieldDecl = *Field; else if (I->capturesVariable()) CaptureFields[I->getCapturedVar()] = *Field; } } virtual ~CGCapturedStmtInfo(); CapturedRegionKind getKind() const { return Kind; } virtual void setContextValue(llvm::Value *V) { ThisValue = V; } // \brief Retrieve the value of the context parameter. virtual llvm::Value *getContextValue() const { return ThisValue; } /// \brief Lookup the captured field decl for a variable. virtual const FieldDecl *lookup(const VarDecl *VD) const { return CaptureFields.lookup(VD); } bool isCXXThisExprCaptured() const { return getThisFieldDecl() != nullptr; } virtual FieldDecl *getThisFieldDecl() const { return CXXThisFieldDecl; } static bool classof(const CGCapturedStmtInfo *) { return true; } /// \brief Emit the captured statement body. virtual void EmitBody(CodeGenFunction &CGF, const Stmt *S) { CGF.incrementProfileCounter(S); CGF.EmitStmt(S); } /// \brief Get the name of the capture helper. virtual StringRef getHelperName() const { return "__captured_stmt"; } private: /// \brief The kind of captured statement being generated. CapturedRegionKind Kind; /// \brief Keep the map between VarDecl and FieldDecl. llvm::SmallDenseMap CaptureFields; /// \brief The base address of the captured record, passed in as the first /// argument of the parallel region function. llvm::Value *ThisValue; /// \brief Captured 'this' type. FieldDecl *CXXThisFieldDecl; }; CGCapturedStmtInfo *CapturedStmtInfo; /// \brief RAII for correct setting/restoring of CapturedStmtInfo. class CGCapturedStmtRAII { private: CodeGenFunction &CGF; CGCapturedStmtInfo *PrevCapturedStmtInfo; public: CGCapturedStmtRAII(CodeGenFunction &CGF, CGCapturedStmtInfo *NewCapturedStmtInfo) : CGF(CGF), PrevCapturedStmtInfo(CGF.CapturedStmtInfo) { CGF.CapturedStmtInfo = NewCapturedStmtInfo; } ~CGCapturedStmtRAII() { CGF.CapturedStmtInfo = PrevCapturedStmtInfo; } }; /// \brief Sanitizers enabled for this function. SanitizerSet SanOpts; /// \brief True if CodeGen currently emits code implementing sanitizer checks. bool IsSanitizerScope; /// \brief RAII object to set/unset CodeGenFunction::IsSanitizerScope. class SanitizerScope { CodeGenFunction *CGF; public: SanitizerScope(CodeGenFunction *CGF); ~SanitizerScope(); }; /// In C++, whether we are code generating a thunk. This controls whether we /// should emit cleanups. bool CurFuncIsThunk; /// In ARC, whether we should autorelease the return value. bool AutoreleaseResult; /// Whether we processed a Microsoft-style asm block during CodeGen. These can /// potentially set the return value. bool SawAsmBlock; /// True if the current function is an outlined SEH helper. This can be a /// finally block or filter expression. bool IsOutlinedSEHHelper; const CodeGen::CGBlockInfo *BlockInfo; llvm::Value *BlockPointer; llvm::DenseMap LambdaCaptureFields; FieldDecl *LambdaThisCaptureField; /// \brief A mapping from NRVO variables to the flags used to indicate /// when the NRVO has been applied to this variable. llvm::DenseMap NRVOFlags; EHScopeStack EHStack; llvm::SmallVector LifetimeExtendedCleanupStack; llvm::SmallVector SEHTryEpilogueStack; llvm::Instruction *CurrentFuncletPad = nullptr; /// Header for data within LifetimeExtendedCleanupStack. struct LifetimeExtendedCleanupHeader { /// The size of the following cleanup object. unsigned Size; /// The kind of cleanup to push: a value from the CleanupKind enumeration. CleanupKind Kind; size_t getSize() const { return Size; } CleanupKind getKind() const { return Kind; } }; /// i32s containing the indexes of the cleanup destinations. llvm::AllocaInst *NormalCleanupDest; unsigned NextCleanupDestIndex; /// FirstBlockInfo - The head of a singly-linked-list of block layouts. CGBlockInfo *FirstBlockInfo; /// EHResumeBlock - Unified block containing a call to llvm.eh.resume. llvm::BasicBlock *EHResumeBlock; /// The exception slot. All landing pads write the current exception pointer /// into this alloca. llvm::Value *ExceptionSlot; /// The selector slot. Under the MandatoryCleanup model, all landing pads /// write the current selector value into this alloca. llvm::AllocaInst *EHSelectorSlot; /// A stack of exception code slots. Entering an __except block pushes a slot /// on the stack and leaving pops one. The __exception_code() intrinsic loads /// a value from the top of the stack. SmallVector SEHCodeSlotStack; /// Value returned by __exception_info intrinsic. llvm::Value *SEHInfo = nullptr; /// Emits a landing pad for the current EH stack. llvm::BasicBlock *EmitLandingPad(); llvm::BasicBlock *getInvokeDestImpl(); template typename DominatingValue::saved_type saveValueInCond(T value) { return DominatingValue::save(*this, value); } public: /// ObjCEHValueStack - Stack of Objective-C exception values, used for /// rethrows. SmallVector ObjCEHValueStack; /// A class controlling the emission of a finally block. class FinallyInfo { /// Where the catchall's edge through the cleanup should go. JumpDest RethrowDest; /// A function to call to enter the catch. llvm::Constant *BeginCatchFn; /// An i1 variable indicating whether or not the @finally is /// running for an exception. llvm::AllocaInst *ForEHVar; /// An i8* variable into which the exception pointer to rethrow /// has been saved. llvm::AllocaInst *SavedExnVar; public: void enter(CodeGenFunction &CGF, const Stmt *Finally, llvm::Constant *beginCatchFn, llvm::Constant *endCatchFn, llvm::Constant *rethrowFn); void exit(CodeGenFunction &CGF); }; /// Returns true inside SEH __try blocks. bool isSEHTryScope() const { return !SEHTryEpilogueStack.empty(); } /// Returns true while emitting a cleanuppad. bool isCleanupPadScope() const { return CurrentFuncletPad && isa(CurrentFuncletPad); } /// pushFullExprCleanup - Push a cleanup to be run at the end of the /// current full-expression. Safe against the possibility that /// we're currently inside a conditionally-evaluated expression. template void pushFullExprCleanup(CleanupKind kind, As... A) { // If we're not in a conditional branch, or if none of the // arguments requires saving, then use the unconditional cleanup. if (!isInConditionalBranch()) return EHStack.pushCleanup(kind, A...); // Stash values in a tuple so we can guarantee the order of saves. typedef std::tuple::saved_type...> SavedTuple; SavedTuple Saved{saveValueInCond(A)...}; typedef EHScopeStack::ConditionalCleanup CleanupType; EHStack.pushCleanupTuple(kind, Saved); initFullExprCleanup(); } /// \brief Queue a cleanup to be pushed after finishing the current /// full-expression. template void pushCleanupAfterFullExpr(CleanupKind Kind, As... A) { assert(!isInConditionalBranch() && "can't defer conditional cleanup"); LifetimeExtendedCleanupHeader Header = { sizeof(T), Kind }; size_t OldSize = LifetimeExtendedCleanupStack.size(); LifetimeExtendedCleanupStack.resize( LifetimeExtendedCleanupStack.size() + sizeof(Header) + Header.Size); static_assert(sizeof(Header) % llvm::AlignOf::Alignment == 0, "Cleanup will be allocated on misaligned address"); char *Buffer = &LifetimeExtendedCleanupStack[OldSize]; new (Buffer) LifetimeExtendedCleanupHeader(Header); new (Buffer + sizeof(Header)) T(A...); } /// Set up the last cleaup that was pushed as a conditional /// full-expression cleanup. void initFullExprCleanup(); /// PushDestructorCleanup - Push a cleanup to call the /// complete-object destructor of an object of the given type at the /// given address. Does nothing if T is not a C++ class type with a /// non-trivial destructor. void PushDestructorCleanup(QualType T, Address Addr); /// PushDestructorCleanup - Push a cleanup to call the /// complete-object variant of the given destructor on the object at /// the given address. void PushDestructorCleanup(const CXXDestructorDecl *Dtor, Address Addr); /// PopCleanupBlock - Will pop the cleanup entry on the stack and /// process all branch fixups. void PopCleanupBlock(bool FallThroughIsBranchThrough = false); /// DeactivateCleanupBlock - Deactivates the given cleanup block. /// The block cannot be reactivated. Pops it if it's the top of the /// stack. /// /// \param DominatingIP - An instruction which is known to /// dominate the current IP (if set) and which lies along /// all paths of execution between the current IP and the /// the point at which the cleanup comes into scope. void DeactivateCleanupBlock(EHScopeStack::stable_iterator Cleanup, llvm::Instruction *DominatingIP); /// ActivateCleanupBlock - Activates an initially-inactive cleanup. /// Cannot be used to resurrect a deactivated cleanup. /// /// \param DominatingIP - An instruction which is known to /// dominate the current IP (if set) and which lies along /// all paths of execution between the current IP and the /// the point at which the cleanup comes into scope. void ActivateCleanupBlock(EHScopeStack::stable_iterator Cleanup, llvm::Instruction *DominatingIP); /// \brief Enters a new scope for capturing cleanups, all of which /// will be executed once the scope is exited. class RunCleanupsScope { EHScopeStack::stable_iterator CleanupStackDepth; size_t LifetimeExtendedCleanupStackSize; bool OldDidCallStackSave; protected: bool PerformCleanup; private: RunCleanupsScope(const RunCleanupsScope &) = delete; void operator=(const RunCleanupsScope &) = delete; protected: CodeGenFunction& CGF; public: /// \brief Enter a new cleanup scope. explicit RunCleanupsScope(CodeGenFunction &CGF) : PerformCleanup(true), CGF(CGF) { CleanupStackDepth = CGF.EHStack.stable_begin(); LifetimeExtendedCleanupStackSize = CGF.LifetimeExtendedCleanupStack.size(); OldDidCallStackSave = CGF.DidCallStackSave; CGF.DidCallStackSave = false; } /// \brief Exit this cleanup scope, emitting any accumulated /// cleanups. ~RunCleanupsScope() { if (PerformCleanup) { CGF.DidCallStackSave = OldDidCallStackSave; CGF.PopCleanupBlocks(CleanupStackDepth, LifetimeExtendedCleanupStackSize); } } /// \brief Determine whether this scope requires any cleanups. bool requiresCleanups() const { return CGF.EHStack.stable_begin() != CleanupStackDepth; } /// \brief Force the emission of cleanups now, instead of waiting /// until this object is destroyed. void ForceCleanup() { assert(PerformCleanup && "Already forced cleanup"); CGF.DidCallStackSave = OldDidCallStackSave; CGF.PopCleanupBlocks(CleanupStackDepth, LifetimeExtendedCleanupStackSize); PerformCleanup = false; } }; class LexicalScope : public RunCleanupsScope { SourceRange Range; SmallVector Labels; LexicalScope *ParentScope; LexicalScope(const LexicalScope &) = delete; void operator=(const LexicalScope &) = delete; public: /// \brief Enter a new cleanup scope. explicit LexicalScope(CodeGenFunction &CGF, SourceRange Range) : RunCleanupsScope(CGF), Range(Range), ParentScope(CGF.CurLexicalScope) { CGF.CurLexicalScope = this; if (CGDebugInfo *DI = CGF.getDebugInfo()) DI->EmitLexicalBlockStart(CGF.Builder, Range.getBegin()); } void addLabel(const LabelDecl *label) { assert(PerformCleanup && "adding label to dead scope?"); Labels.push_back(label); } /// \brief Exit this cleanup scope, emitting any accumulated /// cleanups. ~LexicalScope() { if (CGDebugInfo *DI = CGF.getDebugInfo()) DI->EmitLexicalBlockEnd(CGF.Builder, Range.getEnd()); // If we should perform a cleanup, force them now. Note that // this ends the cleanup scope before rescoping any labels. if (PerformCleanup) { ApplyDebugLocation DL(CGF, Range.getEnd()); ForceCleanup(); } } /// \brief Force the emission of cleanups now, instead of waiting /// until this object is destroyed. void ForceCleanup() { CGF.CurLexicalScope = ParentScope; RunCleanupsScope::ForceCleanup(); if (!Labels.empty()) rescopeLabels(); } void rescopeLabels(); }; typedef llvm::DenseMap DeclMapTy; /// \brief The scope used to remap some variables as private in the OpenMP /// loop body (or other captured region emitted without outlining), and to /// restore old vars back on exit. class OMPPrivateScope : public RunCleanupsScope { DeclMapTy SavedLocals; DeclMapTy SavedPrivates; private: OMPPrivateScope(const OMPPrivateScope &) = delete; void operator=(const OMPPrivateScope &) = delete; public: /// \brief Enter a new OpenMP private scope. explicit OMPPrivateScope(CodeGenFunction &CGF) : RunCleanupsScope(CGF) {} /// \brief Registers \a LocalVD variable as a private and apply \a /// PrivateGen function for it to generate corresponding private variable. /// \a PrivateGen returns an address of the generated private variable. /// \return true if the variable is registered as private, false if it has /// been privatized already. bool addPrivate(const VarDecl *LocalVD, llvm::function_ref PrivateGen) { assert(PerformCleanup && "adding private to dead scope"); // Only save it once. if (SavedLocals.count(LocalVD)) return false; // Copy the existing local entry to SavedLocals. auto it = CGF.LocalDeclMap.find(LocalVD); if (it != CGF.LocalDeclMap.end()) { SavedLocals.insert({LocalVD, it->second}); } else { SavedLocals.insert({LocalVD, Address::invalid()}); } // Generate the private entry. Address Addr = PrivateGen(); QualType VarTy = LocalVD->getType(); if (VarTy->isReferenceType()) { Address Temp = CGF.CreateMemTemp(VarTy); CGF.Builder.CreateStore(Addr.getPointer(), Temp); Addr = Temp; } SavedPrivates.insert({LocalVD, Addr}); return true; } /// \brief Privatizes local variables previously registered as private. /// Registration is separate from the actual privatization to allow /// initializers use values of the original variables, not the private one. /// This is important, for example, if the private variable is a class /// variable initialized by a constructor that references other private /// variables. But at initialization original variables must be used, not /// private copies. /// \return true if at least one variable was privatized, false otherwise. bool Privatize() { copyInto(SavedPrivates, CGF.LocalDeclMap); SavedPrivates.clear(); return !SavedLocals.empty(); } void ForceCleanup() { RunCleanupsScope::ForceCleanup(); copyInto(SavedLocals, CGF.LocalDeclMap); SavedLocals.clear(); } /// \brief Exit scope - all the mapped variables are restored. ~OMPPrivateScope() { if (PerformCleanup) ForceCleanup(); } private: /// Copy all the entries in the source map over the corresponding /// entries in the destination, which must exist. static void copyInto(const DeclMapTy &src, DeclMapTy &dest) { for (auto &pair : src) { if (!pair.second.isValid()) { dest.erase(pair.first); continue; } auto it = dest.find(pair.first); if (it != dest.end()) { it->second = pair.second; } else { dest.insert(pair); } } } }; /// \brief Takes the old cleanup stack size and emits the cleanup blocks /// that have been added. void PopCleanupBlocks(EHScopeStack::stable_iterator OldCleanupStackSize); /// \brief Takes the old cleanup stack size and emits the cleanup blocks /// that have been added, then adds all lifetime-extended cleanups from /// the given position to the stack. void PopCleanupBlocks(EHScopeStack::stable_iterator OldCleanupStackSize, size_t OldLifetimeExtendedStackSize); void ResolveBranchFixups(llvm::BasicBlock *Target); /// The given basic block lies in the current EH scope, but may be a /// target of a potentially scope-crossing jump; get a stable handle /// to which we can perform this jump later. JumpDest getJumpDestInCurrentScope(llvm::BasicBlock *Target) { return JumpDest(Target, EHStack.getInnermostNormalCleanup(), NextCleanupDestIndex++); } /// The given basic block lies in the current EH scope, but may be a /// target of a potentially scope-crossing jump; get a stable handle /// to which we can perform this jump later. JumpDest getJumpDestInCurrentScope(StringRef Name = StringRef()) { return getJumpDestInCurrentScope(createBasicBlock(Name)); } /// EmitBranchThroughCleanup - Emit a branch from the current insert /// block through the normal cleanup handling code (if any) and then /// on to \arg Dest. void EmitBranchThroughCleanup(JumpDest Dest); /// isObviouslyBranchWithoutCleanups - Return true if a branch to the /// specified destination obviously has no cleanups to run. 'false' is always /// a conservatively correct answer for this method. bool isObviouslyBranchWithoutCleanups(JumpDest Dest) const; /// popCatchScope - Pops the catch scope at the top of the EHScope /// stack, emitting any required code (other than the catch handlers /// themselves). void popCatchScope(); llvm::BasicBlock *getEHResumeBlock(bool isCleanup); llvm::BasicBlock *getEHDispatchBlock(EHScopeStack::stable_iterator scope); llvm::BasicBlock *getMSVCDispatchBlock(EHScopeStack::stable_iterator scope); /// An object to manage conditionally-evaluated expressions. class ConditionalEvaluation { llvm::BasicBlock *StartBB; public: ConditionalEvaluation(CodeGenFunction &CGF) : StartBB(CGF.Builder.GetInsertBlock()) {} void begin(CodeGenFunction &CGF) { assert(CGF.OutermostConditional != this); if (!CGF.OutermostConditional) CGF.OutermostConditional = this; } void end(CodeGenFunction &CGF) { assert(CGF.OutermostConditional != nullptr); if (CGF.OutermostConditional == this) CGF.OutermostConditional = nullptr; } /// Returns a block which will be executed prior to each /// evaluation of the conditional code. llvm::BasicBlock *getStartingBlock() const { return StartBB; } }; /// isInConditionalBranch - Return true if we're currently emitting /// one branch or the other of a conditional expression. bool isInConditionalBranch() const { return OutermostConditional != nullptr; } void setBeforeOutermostConditional(llvm::Value *value, Address addr) { assert(isInConditionalBranch()); llvm::BasicBlock *block = OutermostConditional->getStartingBlock(); auto store = new llvm::StoreInst(value, addr.getPointer(), &block->back()); store->setAlignment(addr.getAlignment().getQuantity()); } /// An RAII object to record that we're evaluating a statement /// expression. class StmtExprEvaluation { CodeGenFunction &CGF; /// We have to save the outermost conditional: cleanups in a /// statement expression aren't conditional just because the /// StmtExpr is. ConditionalEvaluation *SavedOutermostConditional; public: StmtExprEvaluation(CodeGenFunction &CGF) : CGF(CGF), SavedOutermostConditional(CGF.OutermostConditional) { CGF.OutermostConditional = nullptr; } ~StmtExprEvaluation() { CGF.OutermostConditional = SavedOutermostConditional; CGF.EnsureInsertPoint(); } }; /// An object which temporarily prevents a value from being /// destroyed by aggressive peephole optimizations that assume that /// all uses of a value have been realized in the IR. class PeepholeProtection { llvm::Instruction *Inst; friend class CodeGenFunction; public: PeepholeProtection() : Inst(nullptr) {} }; /// A non-RAII class containing all the information about a bound /// opaque value. OpaqueValueMapping, below, is a RAII wrapper for /// this which makes individual mappings very simple; using this /// class directly is useful when you have a variable number of /// opaque values or don't want the RAII functionality for some /// reason. class OpaqueValueMappingData { const OpaqueValueExpr *OpaqueValue; bool BoundLValue; CodeGenFunction::PeepholeProtection Protection; OpaqueValueMappingData(const OpaqueValueExpr *ov, bool boundLValue) : OpaqueValue(ov), BoundLValue(boundLValue) {} public: OpaqueValueMappingData() : OpaqueValue(nullptr) {} static bool shouldBindAsLValue(const Expr *expr) { // gl-values should be bound as l-values for obvious reasons. // Records should be bound as l-values because IR generation // always keeps them in memory. Expressions of function type // act exactly like l-values but are formally required to be // r-values in C. return expr->isGLValue() || expr->getType()->isFunctionType() || hasAggregateEvaluationKind(expr->getType()); } static OpaqueValueMappingData bind(CodeGenFunction &CGF, const OpaqueValueExpr *ov, const Expr *e) { if (shouldBindAsLValue(ov)) return bind(CGF, ov, CGF.EmitLValue(e)); return bind(CGF, ov, CGF.EmitAnyExpr(e)); } static OpaqueValueMappingData bind(CodeGenFunction &CGF, const OpaqueValueExpr *ov, const LValue &lv) { assert(shouldBindAsLValue(ov)); CGF.OpaqueLValues.insert(std::make_pair(ov, lv)); return OpaqueValueMappingData(ov, true); } static OpaqueValueMappingData bind(CodeGenFunction &CGF, const OpaqueValueExpr *ov, const RValue &rv) { assert(!shouldBindAsLValue(ov)); CGF.OpaqueRValues.insert(std::make_pair(ov, rv)); OpaqueValueMappingData data(ov, false); // Work around an extremely aggressive peephole optimization in // EmitScalarConversion which assumes that all other uses of a // value are extant. data.Protection = CGF.protectFromPeepholes(rv); return data; } bool isValid() const { return OpaqueValue != nullptr; } void clear() { OpaqueValue = nullptr; } void unbind(CodeGenFunction &CGF) { assert(OpaqueValue && "no data to unbind!"); if (BoundLValue) { CGF.OpaqueLValues.erase(OpaqueValue); } else { CGF.OpaqueRValues.erase(OpaqueValue); CGF.unprotectFromPeepholes(Protection); } } }; /// An RAII object to set (and then clear) a mapping for an OpaqueValueExpr. class OpaqueValueMapping { CodeGenFunction &CGF; OpaqueValueMappingData Data; public: static bool shouldBindAsLValue(const Expr *expr) { return OpaqueValueMappingData::shouldBindAsLValue(expr); } /// Build the opaque value mapping for the given conditional /// operator if it's the GNU ?: extension. This is a common /// enough pattern that the convenience operator is really /// helpful. /// OpaqueValueMapping(CodeGenFunction &CGF, const AbstractConditionalOperator *op) : CGF(CGF) { if (isa(op)) // Leave Data empty. return; const BinaryConditionalOperator *e = cast(op); Data = OpaqueValueMappingData::bind(CGF, e->getOpaqueValue(), e->getCommon()); } OpaqueValueMapping(CodeGenFunction &CGF, const OpaqueValueExpr *opaqueValue, LValue lvalue) : CGF(CGF), Data(OpaqueValueMappingData::bind(CGF, opaqueValue, lvalue)) { } OpaqueValueMapping(CodeGenFunction &CGF, const OpaqueValueExpr *opaqueValue, RValue rvalue) : CGF(CGF), Data(OpaqueValueMappingData::bind(CGF, opaqueValue, rvalue)) { } void pop() { Data.unbind(CGF); Data.clear(); } ~OpaqueValueMapping() { if (Data.isValid()) Data.unbind(CGF); } }; private: CGDebugInfo *DebugInfo; bool DisableDebugInfo; /// DidCallStackSave - Whether llvm.stacksave has been called. Used to avoid /// calling llvm.stacksave for multiple VLAs in the same scope. bool DidCallStackSave; /// IndirectBranch - The first time an indirect goto is seen we create a block /// with an indirect branch. Every time we see the address of a label taken, /// we add the label to the indirect goto. Every subsequent indirect goto is /// codegen'd as a jump to the IndirectBranch's basic block. llvm::IndirectBrInst *IndirectBranch; /// LocalDeclMap - This keeps track of the LLVM allocas or globals for local C /// decls. DeclMapTy LocalDeclMap; /// SizeArguments - If a ParmVarDecl had the pass_object_size attribute, this /// will contain a mapping from said ParmVarDecl to its implicit "object_size" /// parameter. llvm::SmallDenseMap SizeArguments; /// Track escaped local variables with auto storage. Used during SEH /// outlining to produce a call to llvm.localescape. llvm::DenseMap EscapedLocals; /// LabelMap - This keeps track of the LLVM basic block for each C label. llvm::DenseMap LabelMap; // BreakContinueStack - This keeps track of where break and continue // statements should jump to. struct BreakContinue { BreakContinue(JumpDest Break, JumpDest Continue) : BreakBlock(Break), ContinueBlock(Continue) {} JumpDest BreakBlock; JumpDest ContinueBlock; }; SmallVector BreakContinueStack; CodeGenPGO PGO; /// Calculate branch weights appropriate for PGO data llvm::MDNode *createProfileWeights(uint64_t TrueCount, uint64_t FalseCount); llvm::MDNode *createProfileWeights(ArrayRef Weights); llvm::MDNode *createProfileWeightsForLoop(const Stmt *Cond, uint64_t LoopCount); public: /// Increment the profiler's counter for the given statement. void incrementProfileCounter(const Stmt *S) { if (CGM.getCodeGenOpts().ProfileInstrGenerate) PGO.emitCounterIncrement(Builder, S); PGO.setCurrentStmt(S); } /// Get the profiler's count for the given statement. uint64_t getProfileCount(const Stmt *S) { Optional Count = PGO.getStmtCount(S); if (!Count.hasValue()) return 0; return *Count; } /// Set the profiler's current count. void setCurrentProfileCount(uint64_t Count) { PGO.setCurrentRegionCount(Count); } /// Get the profiler's current count. This is generally the count for the most /// recently incremented counter. uint64_t getCurrentProfileCount() { return PGO.getCurrentRegionCount(); } private: /// SwitchInsn - This is nearest current switch instruction. It is null if /// current context is not in a switch. llvm::SwitchInst *SwitchInsn; /// The branch weights of SwitchInsn when doing instrumentation based PGO. SmallVector *SwitchWeights; /// CaseRangeBlock - This block holds if condition check for last case /// statement range in current switch instruction. llvm::BasicBlock *CaseRangeBlock; /// OpaqueLValues - Keeps track of the current set of opaque value /// expressions. llvm::DenseMap OpaqueLValues; llvm::DenseMap OpaqueRValues; // VLASizeMap - This keeps track of the associated size for each VLA type. // We track this by the size expression rather than the type itself because // in certain situations, like a const qualifier applied to an VLA typedef, // multiple VLA types can share the same size expression. // FIXME: Maybe this could be a stack of maps that is pushed/popped as we // enter/leave scopes. llvm::DenseMap VLASizeMap; /// A block containing a single 'unreachable' instruction. Created /// lazily by getUnreachableBlock(). llvm::BasicBlock *UnreachableBlock; /// Counts of the number return expressions in the function. unsigned NumReturnExprs; /// Count the number of simple (constant) return expressions in the function. unsigned NumSimpleReturnExprs; /// The last regular (non-return) debug location (breakpoint) in the function. SourceLocation LastStopPoint; public: /// A scope within which we are constructing the fields of an object which /// might use a CXXDefaultInitExpr. This stashes away a 'this' value to use /// if we need to evaluate a CXXDefaultInitExpr within the evaluation. class FieldConstructionScope { public: FieldConstructionScope(CodeGenFunction &CGF, Address This) : CGF(CGF), OldCXXDefaultInitExprThis(CGF.CXXDefaultInitExprThis) { CGF.CXXDefaultInitExprThis = This; } ~FieldConstructionScope() { CGF.CXXDefaultInitExprThis = OldCXXDefaultInitExprThis; } private: CodeGenFunction &CGF; Address OldCXXDefaultInitExprThis; }; /// The scope of a CXXDefaultInitExpr. Within this scope, the value of 'this' /// is overridden to be the object under construction. class CXXDefaultInitExprScope { public: CXXDefaultInitExprScope(CodeGenFunction &CGF) : CGF(CGF), OldCXXThisValue(CGF.CXXThisValue), OldCXXThisAlignment(CGF.CXXThisAlignment) { CGF.CXXThisValue = CGF.CXXDefaultInitExprThis.getPointer(); CGF.CXXThisAlignment = CGF.CXXDefaultInitExprThis.getAlignment(); } ~CXXDefaultInitExprScope() { CGF.CXXThisValue = OldCXXThisValue; CGF.CXXThisAlignment = OldCXXThisAlignment; } public: CodeGenFunction &CGF; llvm::Value *OldCXXThisValue; CharUnits OldCXXThisAlignment; }; private: /// CXXThisDecl - When generating code for a C++ member function, /// this will hold the implicit 'this' declaration. ImplicitParamDecl *CXXABIThisDecl; llvm::Value *CXXABIThisValue; llvm::Value *CXXThisValue; CharUnits CXXABIThisAlignment; CharUnits CXXThisAlignment; /// The value of 'this' to use when evaluating CXXDefaultInitExprs within /// this expression. Address CXXDefaultInitExprThis = Address::invalid(); /// CXXStructorImplicitParamDecl - When generating code for a constructor or /// destructor, this will hold the implicit argument (e.g. VTT). ImplicitParamDecl *CXXStructorImplicitParamDecl; llvm::Value *CXXStructorImplicitParamValue; /// OutermostConditional - Points to the outermost active /// conditional control. This is used so that we know if a /// temporary should be destroyed conditionally. ConditionalEvaluation *OutermostConditional; /// The current lexical scope. LexicalScope *CurLexicalScope; /// The current source location that should be used for exception /// handling code. SourceLocation CurEHLocation; /// BlockByrefInfos - For each __block variable, contains /// information about the layout of the variable. llvm::DenseMap BlockByrefInfos; llvm::BasicBlock *TerminateLandingPad; llvm::BasicBlock *TerminateHandler; llvm::BasicBlock *TrapBB; /// Add a kernel metadata node to the named metadata node 'opencl.kernels'. /// In the kernel metadata node, reference the kernel function and metadata /// nodes for its optional attribute qualifiers (OpenCL 1.1 6.7.2): /// - A node for the vec_type_hint() qualifier contains string /// "vec_type_hint", an undefined value of the data type, /// and a Boolean that is true if the is integer and signed. /// - A node for the work_group_size_hint(X,Y,Z) qualifier contains string /// "work_group_size_hint", and three 32-bit integers X, Y and Z. /// - A node for the reqd_work_group_size(X,Y,Z) qualifier contains string /// "reqd_work_group_size", and three 32-bit integers X, Y and Z. void EmitOpenCLKernelMetadata(const FunctionDecl *FD, llvm::Function *Fn); public: CodeGenFunction(CodeGenModule &cgm, bool suppressNewContext=false); ~CodeGenFunction(); CodeGenTypes &getTypes() const { return CGM.getTypes(); } ASTContext &getContext() const { return CGM.getContext(); } CGDebugInfo *getDebugInfo() { if (DisableDebugInfo) return nullptr; return DebugInfo; } void disableDebugInfo() { DisableDebugInfo = true; } void enableDebugInfo() { DisableDebugInfo = false; } bool shouldUseFusedARCCalls() { return CGM.getCodeGenOpts().OptimizationLevel == 0; } const LangOptions &getLangOpts() const { return CGM.getLangOpts(); } /// Returns a pointer to the function's exception object and selector slot, /// which is assigned in every landing pad. Address getExceptionSlot(); Address getEHSelectorSlot(); /// Returns the contents of the function's exception object and selector /// slots. llvm::Value *getExceptionFromSlot(); llvm::Value *getSelectorFromSlot(); Address getNormalCleanupDestSlot(); llvm::BasicBlock *getUnreachableBlock() { if (!UnreachableBlock) { UnreachableBlock = createBasicBlock("unreachable"); new llvm::UnreachableInst(getLLVMContext(), UnreachableBlock); } return UnreachableBlock; } llvm::BasicBlock *getInvokeDest() { if (!EHStack.requiresLandingPad()) return nullptr; return getInvokeDestImpl(); } bool currentFunctionUsesSEHTry() const { const auto *FD = dyn_cast_or_null(CurCodeDecl); return FD && FD->usesSEHTry(); } const TargetInfo &getTarget() const { return Target; } llvm::LLVMContext &getLLVMContext() { return CGM.getLLVMContext(); } //===--------------------------------------------------------------------===// // Cleanups //===--------------------------------------------------------------------===// typedef void Destroyer(CodeGenFunction &CGF, Address addr, QualType ty); void pushIrregularPartialArrayCleanup(llvm::Value *arrayBegin, Address arrayEndPointer, QualType elementType, CharUnits elementAlignment, Destroyer *destroyer); void pushRegularPartialArrayCleanup(llvm::Value *arrayBegin, llvm::Value *arrayEnd, QualType elementType, CharUnits elementAlignment, Destroyer *destroyer); void pushDestroy(QualType::DestructionKind dtorKind, Address addr, QualType type); void pushEHDestroy(QualType::DestructionKind dtorKind, Address addr, QualType type); void pushDestroy(CleanupKind kind, Address addr, QualType type, Destroyer *destroyer, bool useEHCleanupForArray); void pushLifetimeExtendedDestroy(CleanupKind kind, Address addr, QualType type, Destroyer *destroyer, bool useEHCleanupForArray); void pushCallObjectDeleteCleanup(const FunctionDecl *OperatorDelete, llvm::Value *CompletePtr, QualType ElementType); void pushStackRestore(CleanupKind kind, Address SPMem); void emitDestroy(Address addr, QualType type, Destroyer *destroyer, bool useEHCleanupForArray); llvm::Function *generateDestroyHelper(Address addr, QualType type, Destroyer *destroyer, bool useEHCleanupForArray, const VarDecl *VD); void emitArrayDestroy(llvm::Value *begin, llvm::Value *end, QualType elementType, CharUnits elementAlign, Destroyer *destroyer, bool checkZeroLength, bool useEHCleanup); Destroyer *getDestroyer(QualType::DestructionKind destructionKind); /// Determines whether an EH cleanup is required to destroy a type /// with the given destruction kind. bool needsEHCleanup(QualType::DestructionKind kind) { switch (kind) { case QualType::DK_none: return false; case QualType::DK_cxx_destructor: case QualType::DK_objc_weak_lifetime: return getLangOpts().Exceptions; case QualType::DK_objc_strong_lifetime: return getLangOpts().Exceptions && CGM.getCodeGenOpts().ObjCAutoRefCountExceptions; } llvm_unreachable("bad destruction kind"); } CleanupKind getCleanupKind(QualType::DestructionKind kind) { return (needsEHCleanup(kind) ? NormalAndEHCleanup : NormalCleanup); } //===--------------------------------------------------------------------===// // Objective-C //===--------------------------------------------------------------------===// void GenerateObjCMethod(const ObjCMethodDecl *OMD); void StartObjCMethod(const ObjCMethodDecl *MD, const ObjCContainerDecl *CD); /// GenerateObjCGetter - Synthesize an Objective-C property getter function. void GenerateObjCGetter(ObjCImplementationDecl *IMP, const ObjCPropertyImplDecl *PID); void generateObjCGetterBody(const ObjCImplementationDecl *classImpl, const ObjCPropertyImplDecl *propImpl, const ObjCMethodDecl *GetterMothodDecl, llvm::Constant *AtomicHelperFn); void GenerateObjCCtorDtorMethod(ObjCImplementationDecl *IMP, ObjCMethodDecl *MD, bool ctor); /// GenerateObjCSetter - Synthesize an Objective-C property setter function /// for the given property. void GenerateObjCSetter(ObjCImplementationDecl *IMP, const ObjCPropertyImplDecl *PID); void generateObjCSetterBody(const ObjCImplementationDecl *classImpl, const ObjCPropertyImplDecl *propImpl, llvm::Constant *AtomicHelperFn); //===--------------------------------------------------------------------===// // Block Bits //===--------------------------------------------------------------------===// llvm::Value *EmitBlockLiteral(const BlockExpr *); llvm::Value *EmitBlockLiteral(const CGBlockInfo &Info); static void destroyBlockInfos(CGBlockInfo *info); llvm::Function *GenerateBlockFunction(GlobalDecl GD, const CGBlockInfo &Info, const DeclMapTy &ldm, bool IsLambdaConversionToBlock); llvm::Constant *GenerateCopyHelperFunction(const CGBlockInfo &blockInfo); llvm::Constant *GenerateDestroyHelperFunction(const CGBlockInfo &blockInfo); llvm::Constant *GenerateObjCAtomicSetterCopyHelperFunction( const ObjCPropertyImplDecl *PID); llvm::Constant *GenerateObjCAtomicGetterCopyHelperFunction( const ObjCPropertyImplDecl *PID); llvm::Value *EmitBlockCopyAndAutorelease(llvm::Value *Block, QualType Ty); void BuildBlockRelease(llvm::Value *DeclPtr, BlockFieldFlags flags); class AutoVarEmission; void emitByrefStructureInit(const AutoVarEmission &emission); void enterByrefCleanup(const AutoVarEmission &emission); void setBlockContextParameter(const ImplicitParamDecl *D, unsigned argNum, llvm::Value *ptr); Address LoadBlockStruct(); Address GetAddrOfBlockDecl(const VarDecl *var, bool ByRef); /// BuildBlockByrefAddress - Computes the location of the /// data in a variable which is declared as __block. Address emitBlockByrefAddress(Address baseAddr, const VarDecl *V, bool followForward = true); Address emitBlockByrefAddress(Address baseAddr, const BlockByrefInfo &info, bool followForward, const llvm::Twine &name); const BlockByrefInfo &getBlockByrefInfo(const VarDecl *var); void GenerateCode(GlobalDecl GD, llvm::Function *Fn, const CGFunctionInfo &FnInfo); /// \brief Emit code for the start of a function. /// \param Loc The location to be associated with the function. /// \param StartLoc The location of the function body. void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc = SourceLocation(), SourceLocation StartLoc = SourceLocation()); void EmitConstructorBody(FunctionArgList &Args); void EmitDestructorBody(FunctionArgList &Args); void emitImplicitAssignmentOperatorBody(FunctionArgList &Args); void EmitFunctionBody(FunctionArgList &Args, const Stmt *Body); void EmitBlockWithFallThrough(llvm::BasicBlock *BB, const Stmt *S); void EmitForwardingCallToLambda(const CXXMethodDecl *LambdaCallOperator, CallArgList &CallArgs); void EmitLambdaToBlockPointerBody(FunctionArgList &Args); void EmitLambdaBlockInvokeBody(); void EmitLambdaDelegatingInvokeBody(const CXXMethodDecl *MD); void EmitLambdaStaticInvokeFunction(const CXXMethodDecl *MD); void EmitAsanPrologueOrEpilogue(bool Prologue); /// \brief Emit the unified return block, trying to avoid its emission when /// possible. /// \return The debug location of the user written return statement if the /// return block is is avoided. llvm::DebugLoc EmitReturnBlock(); /// FinishFunction - Complete IR generation of the current function. It is /// legal to call this function even if there is no current insertion point. void FinishFunction(SourceLocation EndLoc=SourceLocation()); void StartThunk(llvm::Function *Fn, GlobalDecl GD, const CGFunctionInfo &FnInfo); void EmitCallAndReturnForThunk(llvm::Value *Callee, const ThunkInfo *Thunk); void FinishThunk(); /// Emit a musttail call for a thunk with a potentially adjusted this pointer. void EmitMustTailThunk(const CXXMethodDecl *MD, llvm::Value *AdjustedThisPtr, llvm::Value *Callee); /// Generate a thunk for the given method. void generateThunk(llvm::Function *Fn, const CGFunctionInfo &FnInfo, GlobalDecl GD, const ThunkInfo &Thunk); llvm::Function *GenerateVarArgsThunk(llvm::Function *Fn, const CGFunctionInfo &FnInfo, GlobalDecl GD, const ThunkInfo &Thunk); void EmitCtorPrologue(const CXXConstructorDecl *CD, CXXCtorType Type, FunctionArgList &Args); void EmitInitializerForField(FieldDecl *Field, LValue LHS, Expr *Init, ArrayRef ArrayIndexes); /// Struct with all informations about dynamic [sub]class needed to set vptr. struct VPtr { BaseSubobject Base; const CXXRecordDecl *NearestVBase; CharUnits OffsetFromNearestVBase; const CXXRecordDecl *VTableClass; }; /// Initialize the vtable pointer of the given subobject. void InitializeVTablePointer(const VPtr &vptr); typedef llvm::SmallVector VPtrsVector; typedef llvm::SmallPtrSet VisitedVirtualBasesSetTy; VPtrsVector getVTablePointers(const CXXRecordDecl *VTableClass); void getVTablePointers(BaseSubobject Base, const CXXRecordDecl *NearestVBase, CharUnits OffsetFromNearestVBase, bool BaseIsNonVirtualPrimaryBase, const CXXRecordDecl *VTableClass, VisitedVirtualBasesSetTy &VBases, VPtrsVector &vptrs); void InitializeVTablePointers(const CXXRecordDecl *ClassDecl); /// GetVTablePtr - Return the Value of the vtable pointer member pointed /// to by This. llvm::Value *GetVTablePtr(Address This, llvm::Type *VTableTy, const CXXRecordDecl *VTableClass); enum CFITypeCheckKind { CFITCK_VCall, CFITCK_NVCall, CFITCK_DerivedCast, CFITCK_UnrelatedCast, }; /// \brief Derived is the presumed address of an object of type T after a /// cast. If T is a polymorphic class type, emit a check that the virtual /// table for Derived belongs to a class derived from T. void EmitVTablePtrCheckForCast(QualType T, llvm::Value *Derived, bool MayBeNull, CFITypeCheckKind TCK, SourceLocation Loc); /// EmitVTablePtrCheckForCall - Virtual method MD is being called via VTable. /// If vptr CFI is enabled, emit a check that VTable is valid. void EmitVTablePtrCheckForCall(const CXXMethodDecl *MD, llvm::Value *VTable, CFITypeCheckKind TCK, SourceLocation Loc); /// EmitVTablePtrCheck - Emit a check that VTable is a valid virtual table for /// RD using llvm.bitset.test. void EmitVTablePtrCheck(const CXXRecordDecl *RD, llvm::Value *VTable, CFITypeCheckKind TCK, SourceLocation Loc); /// CanDevirtualizeMemberFunctionCalls - Checks whether virtual calls on given /// expr can be devirtualized. bool CanDevirtualizeMemberFunctionCall(const Expr *Base, const CXXMethodDecl *MD); /// EnterDtorCleanups - Enter the cleanups necessary to complete the /// given phase of destruction for a destructor. The end result /// should call destructors on members and base classes in reverse /// order of their construction. void EnterDtorCleanups(const CXXDestructorDecl *Dtor, CXXDtorType Type); /// ShouldInstrumentFunction - Return true if the current function should be /// instrumented with __cyg_profile_func_* calls bool ShouldInstrumentFunction(); /// EmitFunctionInstrumentation - Emit LLVM code to call the specified /// instrumentation function with the current function and the call site, if /// function instrumentation is enabled. void EmitFunctionInstrumentation(const char *Fn); /// EmitMCountInstrumentation - Emit call to .mcount. void EmitMCountInstrumentation(); /// EmitFunctionProlog - Emit the target specific LLVM code to load the /// arguments for the given function. This is also responsible for naming the /// LLVM function arguments. void EmitFunctionProlog(const CGFunctionInfo &FI, llvm::Function *Fn, const FunctionArgList &Args); /// EmitFunctionEpilog - Emit the target specific LLVM code to return the /// given temporary. void EmitFunctionEpilog(const CGFunctionInfo &FI, bool EmitRetDbgLoc, SourceLocation EndLoc); /// EmitStartEHSpec - Emit the start of the exception spec. void EmitStartEHSpec(const Decl *D); /// EmitEndEHSpec - Emit the end of the exception spec. void EmitEndEHSpec(const Decl *D); /// getTerminateLandingPad - Return a landing pad that just calls terminate. llvm::BasicBlock *getTerminateLandingPad(); /// getTerminateHandler - Return a handler (not a landing pad, just /// a catch handler) that just calls terminate. This is used when /// a terminate scope encloses a try. llvm::BasicBlock *getTerminateHandler(); llvm::Type *ConvertTypeForMem(QualType T); llvm::Type *ConvertType(QualType T); llvm::Type *ConvertType(const TypeDecl *T) { return ConvertType(getContext().getTypeDeclType(T)); } /// LoadObjCSelf - Load the value of self. This function is only valid while /// generating code for an Objective-C method. llvm::Value *LoadObjCSelf(); /// TypeOfSelfObject - Return type of object that this self represents. QualType TypeOfSelfObject(); /// hasAggregateLLVMType - Return true if the specified AST type will map into /// an aggregate LLVM type or is void. static TypeEvaluationKind getEvaluationKind(QualType T); static bool hasScalarEvaluationKind(QualType T) { return getEvaluationKind(T) == TEK_Scalar; } static bool hasAggregateEvaluationKind(QualType T) { return getEvaluationKind(T) == TEK_Aggregate; } /// createBasicBlock - Create an LLVM basic block. llvm::BasicBlock *createBasicBlock(const Twine &name = "", llvm::Function *parent = nullptr, llvm::BasicBlock *before = nullptr) { #ifdef NDEBUG return llvm::BasicBlock::Create(getLLVMContext(), "", parent, before); #else return llvm::BasicBlock::Create(getLLVMContext(), name, parent, before); #endif } /// getBasicBlockForLabel - Return the LLVM basicblock that the specified /// label maps to. JumpDest getJumpDestForLabel(const LabelDecl *S); /// SimplifyForwardingBlocks - If the given basic block is only a branch to /// another basic block, simplify it. This assumes that no other code could /// potentially reference the basic block. void SimplifyForwardingBlocks(llvm::BasicBlock *BB); /// EmitBlock - Emit the given block \arg BB and set it as the insert point, /// adding a fall-through branch from the current insert block if /// necessary. It is legal to call this function even if there is no current /// insertion point. /// /// IsFinished - If true, indicates that the caller has finished emitting /// branches to the given block and does not expect to emit code into it. This /// means the block can be ignored if it is unreachable. void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false); /// EmitBlockAfterUses - Emit the given block somewhere hopefully /// near its uses, and leave the insertion point in it. void EmitBlockAfterUses(llvm::BasicBlock *BB); /// EmitBranch - Emit a branch to the specified basic block from the current /// insert block, taking care to avoid creation of branches from dummy /// blocks. It is legal to call this function even if there is no current /// insertion point. /// /// This function clears the current insertion point. The caller should follow /// calls to this function with calls to Emit*Block prior to generation new /// code. void EmitBranch(llvm::BasicBlock *Block); /// HaveInsertPoint - True if an insertion point is defined. If not, this /// indicates that the current code being emitted is unreachable. bool HaveInsertPoint() const { return Builder.GetInsertBlock() != nullptr; } /// EnsureInsertPoint - Ensure that an insertion point is defined so that /// emitted IR has a place to go. Note that by definition, if this function /// creates a block then that block is unreachable; callers may do better to /// detect when no insertion point is defined and simply skip IR generation. void EnsureInsertPoint() { if (!HaveInsertPoint()) EmitBlock(createBasicBlock()); } /// ErrorUnsupported - Print out an error that codegen doesn't support the /// specified stmt yet. void ErrorUnsupported(const Stmt *S, const char *Type); //===--------------------------------------------------------------------===// // Helpers //===--------------------------------------------------------------------===// LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource AlignSource = AlignmentSource::Type) { return LValue::MakeAddr(Addr, T, getContext(), AlignSource, CGM.getTBAAInfo(T)); } LValue MakeAddrLValue(llvm::Value *V, QualType T, CharUnits Alignment, AlignmentSource AlignSource = AlignmentSource::Type) { return LValue::MakeAddr(Address(V, Alignment), T, getContext(), AlignSource, CGM.getTBAAInfo(T)); } LValue MakeNaturalAlignPointeeAddrLValue(llvm::Value *V, QualType T); LValue MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T); CharUnits getNaturalTypeAlignment(QualType T, AlignmentSource *Source = nullptr, bool forPointeeType = false); CharUnits getNaturalPointeeTypeAlignment(QualType T, AlignmentSource *Source = nullptr); Address EmitLoadOfReference(Address Ref, const ReferenceType *RefTy, AlignmentSource *Source = nullptr); LValue EmitLoadOfReferenceLValue(Address Ref, const ReferenceType *RefTy); /// CreateTempAlloca - This creates a alloca and inserts it into the entry /// block. The caller is responsible for setting an appropriate alignment on /// the alloca. llvm::AllocaInst *CreateTempAlloca(llvm::Type *Ty, const Twine &Name = "tmp"); Address CreateTempAlloca(llvm::Type *Ty, CharUnits align, const Twine &Name = "tmp"); /// CreateDefaultAlignedTempAlloca - This creates an alloca with the /// default ABI alignment of the given LLVM type. /// /// IMPORTANT NOTE: This is *not* generally the right alignment for /// any given AST type that happens to have been lowered to the /// given IR type. This should only ever be used for function-local, /// IR-driven manipulations like saving and restoring a value. Do /// not hand this address off to arbitrary IRGen routines, and especially /// do not pass it as an argument to a function that might expect a /// properly ABI-aligned value. Address CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name = "tmp"); /// InitTempAlloca - Provide an initial value for the given alloca which /// will be observable at all locations in the function. /// /// The address should be something that was returned from one of /// the CreateTempAlloca or CreateMemTemp routines, and the /// initializer must be valid in the entry block (i.e. it must /// either be a constant or an argument value). void InitTempAlloca(Address Alloca, llvm::Value *Value); /// CreateIRTemp - Create a temporary IR object of the given type, with /// appropriate alignment. This routine should only be used when an temporary /// value needs to be stored into an alloca (for example, to avoid explicit /// PHI construction), but the type is the IR type, not the type appropriate /// for storing in memory. /// /// That is, this is exactly equivalent to CreateMemTemp, but calling /// ConvertType instead of ConvertTypeForMem. Address CreateIRTemp(QualType T, const Twine &Name = "tmp"); /// CreateMemTemp - Create a temporary memory object of the given type, with /// appropriate alignment. Address CreateMemTemp(QualType T, const Twine &Name = "tmp"); Address CreateMemTemp(QualType T, CharUnits Align, const Twine &Name = "tmp"); /// CreateAggTemp - Create a temporary memory object for the given /// aggregate type. AggValueSlot CreateAggTemp(QualType T, const Twine &Name = "tmp") { return AggValueSlot::forAddr(CreateMemTemp(T, Name), T.getQualifiers(), AggValueSlot::IsNotDestructed, AggValueSlot::DoesNotNeedGCBarriers, AggValueSlot::IsNotAliased); } /// Emit a cast to void* in the appropriate address space. llvm::Value *EmitCastToVoidPtr(llvm::Value *value); /// EvaluateExprAsBool - Perform the usual unary conversions on the specified /// expression and compare the result against zero, returning an Int1Ty value. llvm::Value *EvaluateExprAsBool(const Expr *E); /// EmitIgnoredExpr - Emit an expression in a context which ignores the result. void EmitIgnoredExpr(const Expr *E); /// EmitAnyExpr - Emit code to compute the specified expression which can have /// any type. The result is returned as an RValue struct. If this is an /// aggregate expression, the aggloc/agglocvolatile arguments indicate where /// the result should be returned. /// /// \param ignoreResult True if the resulting value isn't used. RValue EmitAnyExpr(const Expr *E, AggValueSlot aggSlot = AggValueSlot::ignored(), bool ignoreResult = false); // EmitVAListRef - Emit a "reference" to a va_list; this is either the address // or the value of the expression, depending on how va_list is defined. Address EmitVAListRef(const Expr *E); /// Emit a "reference" to a __builtin_ms_va_list; this is /// always the value of the expression, because a __builtin_ms_va_list is a /// pointer to a char. Address EmitMSVAListRef(const Expr *E); /// EmitAnyExprToTemp - Similary to EmitAnyExpr(), however, the result will /// always be accessible even if no aggregate location is provided. RValue EmitAnyExprToTemp(const Expr *E); /// EmitAnyExprToMem - Emits the code necessary to evaluate an /// arbitrary expression into the given memory location. void EmitAnyExprToMem(const Expr *E, Address Location, Qualifiers Quals, bool IsInitializer); void EmitAnyExprToExn(const Expr *E, Address Addr); /// EmitExprAsInit - Emits the code necessary to initialize a /// location in memory with the given initializer. void EmitExprAsInit(const Expr *init, const ValueDecl *D, LValue lvalue, bool capturedByInit); /// hasVolatileMember - returns true if aggregate type has a volatile /// member. bool hasVolatileMember(QualType T) { if (const RecordType *RT = T->getAs()) { const RecordDecl *RD = cast(RT->getDecl()); return RD->hasVolatileMember(); } return false; } /// EmitAggregateCopy - Emit an aggregate assignment. /// /// The difference to EmitAggregateCopy is that tail padding is not copied. /// This is required for correctness when assigning non-POD structures in C++. void EmitAggregateAssign(Address DestPtr, Address SrcPtr, QualType EltTy) { bool IsVolatile = hasVolatileMember(EltTy); EmitAggregateCopy(DestPtr, SrcPtr, EltTy, IsVolatile, true); } void EmitAggregateCopyCtor(Address DestPtr, Address SrcPtr, QualType DestTy, QualType SrcTy) { EmitAggregateCopy(DestPtr, SrcPtr, SrcTy, /*IsVolatile=*/false, /*IsAssignment=*/false); } /// EmitAggregateCopy - Emit an aggregate copy. /// /// \param isVolatile - True iff either the source or the destination is /// volatile. /// \param isAssignment - If false, allow padding to be copied. This often /// yields more efficient. void EmitAggregateCopy(Address DestPtr, Address SrcPtr, QualType EltTy, bool isVolatile=false, bool isAssignment = false); /// GetAddrOfLocalVar - Return the address of a local variable. Address GetAddrOfLocalVar(const VarDecl *VD) { auto it = LocalDeclMap.find(VD); assert(it != LocalDeclMap.end() && "Invalid argument to GetAddrOfLocalVar(), no decl!"); return it->second; } /// getOpaqueLValueMapping - Given an opaque value expression (which /// must be mapped to an l-value), return its mapping. const LValue &getOpaqueLValueMapping(const OpaqueValueExpr *e) { assert(OpaqueValueMapping::shouldBindAsLValue(e)); llvm::DenseMap::iterator it = OpaqueLValues.find(e); assert(it != OpaqueLValues.end() && "no mapping for opaque value!"); return it->second; } /// getOpaqueRValueMapping - Given an opaque value expression (which /// must be mapped to an r-value), return its mapping. const RValue &getOpaqueRValueMapping(const OpaqueValueExpr *e) { assert(!OpaqueValueMapping::shouldBindAsLValue(e)); llvm::DenseMap::iterator it = OpaqueRValues.find(e); assert(it != OpaqueRValues.end() && "no mapping for opaque value!"); return it->second; } /// getAccessedFieldNo - Given an encoded value and a result number, return /// the input field number being accessed. static unsigned getAccessedFieldNo(unsigned Idx, const llvm::Constant *Elts); llvm::BlockAddress *GetAddrOfLabel(const LabelDecl *L); llvm::BasicBlock *GetIndirectGotoBlock(); /// EmitNullInitialization - Generate code to set a value of the given type to /// null, If the type contains data member pointers, they will be initialized /// to -1 in accordance with the Itanium C++ ABI. void EmitNullInitialization(Address DestPtr, QualType Ty); /// Emits a call to an LLVM variable-argument intrinsic, either /// \c llvm.va_start or \c llvm.va_end. /// \param ArgValue A reference to the \c va_list as emitted by either /// \c EmitVAListRef or \c EmitMSVAListRef. /// \param IsStart If \c true, emits a call to \c llvm.va_start; otherwise, /// calls \c llvm.va_end. llvm::Value *EmitVAStartEnd(llvm::Value *ArgValue, bool IsStart); /// Generate code to get an argument from the passed in pointer /// and update it accordingly. /// \param VE The \c VAArgExpr for which to generate code. /// \param VAListAddr Receives a reference to the \c va_list as emitted by /// either \c EmitVAListRef or \c EmitMSVAListRef. /// \returns A pointer to the argument. // FIXME: We should be able to get rid of this method and use the va_arg // instruction in LLVM instead once it works well enough. Address EmitVAArg(VAArgExpr *VE, Address &VAListAddr); /// emitArrayLength - Compute the length of an array, even if it's a /// VLA, and drill down to the base element type. llvm::Value *emitArrayLength(const ArrayType *arrayType, QualType &baseType, Address &addr); /// EmitVLASize - Capture all the sizes for the VLA expressions in /// the given variably-modified type and store them in the VLASizeMap. /// /// This function can be called with a null (unreachable) insert point. void EmitVariablyModifiedType(QualType Ty); /// getVLASize - Returns an LLVM value that corresponds to the size, /// in non-variably-sized elements, of a variable length array type, /// plus that largest non-variably-sized element type. Assumes that /// the type has already been emitted with EmitVariablyModifiedType. std::pair getVLASize(const VariableArrayType *vla); std::pair getVLASize(QualType vla); /// LoadCXXThis - Load the value of 'this'. This function is only valid while /// generating code for an C++ member function. llvm::Value *LoadCXXThis() { assert(CXXThisValue && "no 'this' value for this function"); return CXXThisValue; } Address LoadCXXThisAddress(); /// LoadCXXVTT - Load the VTT parameter to base constructors/destructors have /// virtual bases. // FIXME: Every place that calls LoadCXXVTT is something // that needs to be abstracted properly. llvm::Value *LoadCXXVTT() { assert(CXXStructorImplicitParamValue && "no VTT value for this function"); return CXXStructorImplicitParamValue; } /// GetAddressOfBaseOfCompleteClass - Convert the given pointer to a /// complete class to the given direct base. Address GetAddressOfDirectBaseInCompleteClass(Address Value, const CXXRecordDecl *Derived, const CXXRecordDecl *Base, bool BaseIsVirtual); static bool ShouldNullCheckClassCastValue(const CastExpr *Cast); /// GetAddressOfBaseClass - This function will add the necessary delta to the /// load of 'this' and returns address of the base class. Address GetAddressOfBaseClass(Address Value, const CXXRecordDecl *Derived, CastExpr::path_const_iterator PathBegin, CastExpr::path_const_iterator PathEnd, bool NullCheckValue, SourceLocation Loc); Address GetAddressOfDerivedClass(Address Value, const CXXRecordDecl *Derived, CastExpr::path_const_iterator PathBegin, CastExpr::path_const_iterator PathEnd, bool NullCheckValue); /// GetVTTParameter - Return the VTT parameter that should be passed to a /// base constructor/destructor with virtual bases. /// FIXME: VTTs are Itanium ABI-specific, so the definition should move /// to ItaniumCXXABI.cpp together with all the references to VTT. llvm::Value *GetVTTParameter(GlobalDecl GD, bool ForVirtualBase, bool Delegating); void EmitDelegateCXXConstructorCall(const CXXConstructorDecl *Ctor, CXXCtorType CtorType, const FunctionArgList &Args, SourceLocation Loc); // It's important not to confuse this and the previous function. Delegating // constructors are the C++0x feature. The constructor delegate optimization // is used to reduce duplication in the base and complete consturctors where // they are substantially the same. void EmitDelegatingCXXConstructorCall(const CXXConstructorDecl *Ctor, const FunctionArgList &Args); void EmitCXXConstructorCall(const CXXConstructorDecl *D, CXXCtorType Type, bool ForVirtualBase, bool Delegating, Address This, const CXXConstructExpr *E); /// Emit assumption load for all bases. Requires to be be called only on /// most-derived class and not under construction of the object. void EmitVTableAssumptionLoads(const CXXRecordDecl *ClassDecl, Address This); /// Emit assumption that vptr load == global vtable. void EmitVTableAssumptionLoad(const VPtr &vptr, Address This); void EmitSynthesizedCXXCopyCtorCall(const CXXConstructorDecl *D, Address This, Address Src, const CXXConstructExpr *E); void EmitCXXAggrConstructorCall(const CXXConstructorDecl *D, const ConstantArrayType *ArrayTy, Address ArrayPtr, const CXXConstructExpr *E, bool ZeroInitialization = false); void EmitCXXAggrConstructorCall(const CXXConstructorDecl *D, llvm::Value *NumElements, Address ArrayPtr, const CXXConstructExpr *E, bool ZeroInitialization = false); static Destroyer destroyCXXObject; void EmitCXXDestructorCall(const CXXDestructorDecl *D, CXXDtorType Type, bool ForVirtualBase, bool Delegating, Address This); void EmitNewArrayInitializer(const CXXNewExpr *E, QualType elementType, llvm::Type *ElementTy, Address NewPtr, llvm::Value *NumElements, llvm::Value *AllocSizeWithoutCookie); void EmitCXXTemporary(const CXXTemporary *Temporary, QualType TempType, Address Ptr); llvm::Value *EmitLifetimeStart(uint64_t Size, llvm::Value *Addr); void EmitLifetimeEnd(llvm::Value *Size, llvm::Value *Addr); llvm::Value *EmitCXXNewExpr(const CXXNewExpr *E); void EmitCXXDeleteExpr(const CXXDeleteExpr *E); void EmitDeleteCall(const FunctionDecl *DeleteFD, llvm::Value *Ptr, QualType DeleteTy); RValue EmitBuiltinNewDeleteCall(const FunctionProtoType *Type, const Expr *Arg, bool IsDelete); llvm::Value *EmitCXXTypeidExpr(const CXXTypeidExpr *E); llvm::Value *EmitDynamicCast(Address V, const CXXDynamicCastExpr *DCE); Address EmitCXXUuidofExpr(const CXXUuidofExpr *E); /// \brief Situations in which we might emit a check for the suitability of a /// pointer or glvalue. enum TypeCheckKind { /// Checking the operand of a load. Must be suitably sized and aligned. TCK_Load, /// Checking the destination of a store. Must be suitably sized and aligned. TCK_Store, /// Checking the bound value in a reference binding. Must be suitably sized /// and aligned, but is not required to refer to an object (until the /// reference is used), per core issue 453. TCK_ReferenceBinding, /// Checking the object expression in a non-static data member access. Must /// be an object within its lifetime. TCK_MemberAccess, /// Checking the 'this' pointer for a call to a non-static member function. /// Must be an object within its lifetime. TCK_MemberCall, /// Checking the 'this' pointer for a constructor call. TCK_ConstructorCall, /// Checking the operand of a static_cast to a derived pointer type. Must be /// null or an object within its lifetime. TCK_DowncastPointer, /// Checking the operand of a static_cast to a derived reference type. Must /// be an object within its lifetime. TCK_DowncastReference, /// Checking the operand of a cast to a base object. Must be suitably sized /// and aligned. TCK_Upcast, /// Checking the operand of a cast to a virtual base object. Must be an /// object within its lifetime. TCK_UpcastToVirtualBase }; /// \brief Whether any type-checking sanitizers are enabled. If \c false, /// calls to EmitTypeCheck can be skipped. bool sanitizePerformTypeCheck() const; /// \brief Emit a check that \p V is the address of storage of the /// appropriate size and alignment for an object of type \p Type. void EmitTypeCheck(TypeCheckKind TCK, SourceLocation Loc, llvm::Value *V, QualType Type, CharUnits Alignment = CharUnits::Zero(), bool SkipNullCheck = false); /// \brief Emit a check that \p Base points into an array object, which /// we can access at index \p Index. \p Accessed should be \c false if we /// this expression is used as an lvalue, for instance in "&Arr[Idx]". void EmitBoundsCheck(const Expr *E, const Expr *Base, llvm::Value *Index, QualType IndexType, bool Accessed); llvm::Value *EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, bool isInc, bool isPre); ComplexPairTy EmitComplexPrePostIncDec(const UnaryOperator *E, LValue LV, bool isInc, bool isPre); void EmitAlignmentAssumption(llvm::Value *PtrValue, unsigned Alignment, llvm::Value *OffsetValue = nullptr) { Builder.CreateAlignmentAssumption(CGM.getDataLayout(), PtrValue, Alignment, OffsetValue); } //===--------------------------------------------------------------------===// // Declaration Emission //===--------------------------------------------------------------------===// /// EmitDecl - Emit a declaration. /// /// This function can be called with a null (unreachable) insert point. void EmitDecl(const Decl &D); /// EmitVarDecl - Emit a local variable declaration. /// /// This function can be called with a null (unreachable) insert point. void EmitVarDecl(const VarDecl &D); void EmitScalarInit(const Expr *init, const ValueDecl *D, LValue lvalue, bool capturedByInit); void EmitScalarInit(llvm::Value *init, LValue lvalue); typedef void SpecialInitFn(CodeGenFunction &Init, const VarDecl &D, llvm::Value *Address); /// \brief Determine whether the given initializer is trivial in the sense /// that it requires no code to be generated. bool isTrivialInitializer(const Expr *Init); /// EmitAutoVarDecl - Emit an auto variable declaration. /// /// This function can be called with a null (unreachable) insert point. void EmitAutoVarDecl(const VarDecl &D); class AutoVarEmission { friend class CodeGenFunction; const VarDecl *Variable; /// The address of the alloca. Invalid if the variable was emitted /// as a global constant. Address Addr; llvm::Value *NRVOFlag; /// True if the variable is a __block variable. bool IsByRef; /// True if the variable is of aggregate type and has a constant /// initializer. bool IsConstantAggregate; /// Non-null if we should use lifetime annotations. llvm::Value *SizeForLifetimeMarkers; struct Invalid {}; AutoVarEmission(Invalid) : Variable(nullptr), Addr(Address::invalid()) {} AutoVarEmission(const VarDecl &variable) : Variable(&variable), Addr(Address::invalid()), NRVOFlag(nullptr), IsByRef(false), IsConstantAggregate(false), SizeForLifetimeMarkers(nullptr) {} bool wasEmittedAsGlobal() const { return !Addr.isValid(); } public: static AutoVarEmission invalid() { return AutoVarEmission(Invalid()); } bool useLifetimeMarkers() const { return SizeForLifetimeMarkers != nullptr; } llvm::Value *getSizeForLifetimeMarkers() const { assert(useLifetimeMarkers()); return SizeForLifetimeMarkers; } /// Returns the raw, allocated address, which is not necessarily /// the address of the object itself. Address getAllocatedAddress() const { return Addr; } /// Returns the address of the object within this declaration. /// Note that this does not chase the forwarding pointer for /// __block decls. Address getObjectAddress(CodeGenFunction &CGF) const { if (!IsByRef) return Addr; return CGF.emitBlockByrefAddress(Addr, Variable, /*forward*/ false); } }; AutoVarEmission EmitAutoVarAlloca(const VarDecl &var); void EmitAutoVarInit(const AutoVarEmission &emission); void EmitAutoVarCleanups(const AutoVarEmission &emission); void emitAutoVarTypeCleanup(const AutoVarEmission &emission, QualType::DestructionKind dtorKind); void EmitStaticVarDecl(const VarDecl &D, llvm::GlobalValue::LinkageTypes Linkage); class ParamValue { llvm::Value *Value; unsigned Alignment; ParamValue(llvm::Value *V, unsigned A) : Value(V), Alignment(A) {} public: static ParamValue forDirect(llvm::Value *value) { return ParamValue(value, 0); } static ParamValue forIndirect(Address addr) { assert(!addr.getAlignment().isZero()); return ParamValue(addr.getPointer(), addr.getAlignment().getQuantity()); } bool isIndirect() const { return Alignment != 0; } llvm::Value *getAnyValue() const { return Value; } llvm::Value *getDirectValue() const { assert(!isIndirect()); return Value; } Address getIndirectAddress() const { assert(isIndirect()); return Address(Value, CharUnits::fromQuantity(Alignment)); } }; /// EmitParmDecl - Emit a ParmVarDecl or an ImplicitParamDecl. void EmitParmDecl(const VarDecl &D, ParamValue Arg, unsigned ArgNo); /// protectFromPeepholes - Protect a value that we're intending to /// store to the side, but which will probably be used later, from /// aggressive peepholing optimizations that might delete it. /// /// Pass the result to unprotectFromPeepholes to declare that /// protection is no longer required. /// /// There's no particular reason why this shouldn't apply to /// l-values, it's just that no existing peepholes work on pointers. PeepholeProtection protectFromPeepholes(RValue rvalue); void unprotectFromPeepholes(PeepholeProtection protection); //===--------------------------------------------------------------------===// // Statement Emission //===--------------------------------------------------------------------===// /// EmitStopPoint - Emit a debug stoppoint if we are emitting debug info. void EmitStopPoint(const Stmt *S); /// EmitStmt - Emit the code for the statement \arg S. It is legal to call /// this function even if there is no current insertion point. /// /// This function may clear the current insertion point; callers should use /// EnsureInsertPoint if they wish to subsequently generate code without first /// calling EmitBlock, EmitBranch, or EmitStmt. void EmitStmt(const Stmt *S); /// EmitSimpleStmt - Try to emit a "simple" statement which does not /// necessarily require an insertion point or debug information; typically /// because the statement amounts to a jump or a container of other /// statements. /// /// \return True if the statement was handled. bool EmitSimpleStmt(const Stmt *S); Address EmitCompoundStmt(const CompoundStmt &S, bool GetLast = false, AggValueSlot AVS = AggValueSlot::ignored()); Address EmitCompoundStmtWithoutScope(const CompoundStmt &S, bool GetLast = false, AggValueSlot AVS = AggValueSlot::ignored()); /// EmitLabel - Emit the block for the given label. It is legal to call this /// function even if there is no current insertion point. void EmitLabel(const LabelDecl *D); // helper for EmitLabelStmt. void EmitLabelStmt(const LabelStmt &S); void EmitAttributedStmt(const AttributedStmt &S); void EmitGotoStmt(const GotoStmt &S); void EmitIndirectGotoStmt(const IndirectGotoStmt &S); void EmitIfStmt(const IfStmt &S); void EmitWhileStmt(const WhileStmt &S, ArrayRef Attrs = None); void EmitDoStmt(const DoStmt &S, ArrayRef Attrs = None); void EmitForStmt(const ForStmt &S, ArrayRef Attrs = None); void EmitReturnStmt(const ReturnStmt &S); void EmitDeclStmt(const DeclStmt &S); void EmitBreakStmt(const BreakStmt &S); void EmitContinueStmt(const ContinueStmt &S); void EmitSwitchStmt(const SwitchStmt &S); void EmitDefaultStmt(const DefaultStmt &S); void EmitCaseStmt(const CaseStmt &S); void EmitCaseStmtRange(const CaseStmt &S); void EmitAsmStmt(const AsmStmt &S); void EmitObjCForCollectionStmt(const ObjCForCollectionStmt &S); void EmitObjCAtTryStmt(const ObjCAtTryStmt &S); void EmitObjCAtThrowStmt(const ObjCAtThrowStmt &S); void EmitObjCAtSynchronizedStmt(const ObjCAtSynchronizedStmt &S); void EmitObjCAutoreleasePoolStmt(const ObjCAutoreleasePoolStmt &S); void EnterCXXTryStmt(const CXXTryStmt &S, bool IsFnTryBlock = false); void ExitCXXTryStmt(const CXXTryStmt &S, bool IsFnTryBlock = false); void EmitCXXTryStmt(const CXXTryStmt &S); void EmitSEHTryStmt(const SEHTryStmt &S); void EmitSEHLeaveStmt(const SEHLeaveStmt &S); void EnterSEHTryStmt(const SEHTryStmt &S); void ExitSEHTryStmt(const SEHTryStmt &S); void startOutlinedSEHHelper(CodeGenFunction &ParentCGF, bool IsFilter, const Stmt *OutlinedStmt); llvm::Function *GenerateSEHFilterFunction(CodeGenFunction &ParentCGF, const SEHExceptStmt &Except); llvm::Function *GenerateSEHFinallyFunction(CodeGenFunction &ParentCGF, const SEHFinallyStmt &Finally); void EmitSEHExceptionCodeSave(CodeGenFunction &ParentCGF, llvm::Value *ParentFP, llvm::Value *EntryEBP); llvm::Value *EmitSEHExceptionCode(); llvm::Value *EmitSEHExceptionInfo(); llvm::Value *EmitSEHAbnormalTermination(); /// Scan the outlined statement for captures from the parent function. For /// each capture, mark the capture as escaped and emit a call to /// llvm.localrecover. Insert the localrecover result into the LocalDeclMap. void EmitCapturedLocals(CodeGenFunction &ParentCGF, const Stmt *OutlinedStmt, bool IsFilter); /// Recovers the address of a local in a parent function. ParentVar is the /// address of the variable used in the immediate parent function. It can /// either be an alloca or a call to llvm.localrecover if there are nested /// outlined functions. ParentFP is the frame pointer of the outermost parent /// frame. Address recoverAddrOfEscapedLocal(CodeGenFunction &ParentCGF, Address ParentVar, llvm::Value *ParentFP); void EmitCXXForRangeStmt(const CXXForRangeStmt &S, ArrayRef Attrs = None); LValue InitCapturedStruct(const CapturedStmt &S); llvm::Function *EmitCapturedStmt(const CapturedStmt &S, CapturedRegionKind K); llvm::Function *GenerateCapturedStmtFunction(const CapturedStmt &S); Address GenerateCapturedStmtArgument(const CapturedStmt &S); llvm::Function *GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S); void GenerateOpenMPCapturedVars(const CapturedStmt &S, SmallVectorImpl &CapturedVars); + void emitOMPSimpleStore(LValue LVal, RValue RVal, QualType RValTy, + SourceLocation Loc); /// \brief Perform element by element copying of arrays with type \a /// OriginalType from \a SrcAddr to \a DestAddr using copying procedure /// generated by \a CopyGen. /// /// \param DestAddr Address of the destination array. /// \param SrcAddr Address of the source array. /// \param OriginalType Type of destination and source arrays. /// \param CopyGen Copying procedure that copies value of single array element /// to another single array element. void EmitOMPAggregateAssign( Address DestAddr, Address SrcAddr, QualType OriginalType, const llvm::function_ref &CopyGen); /// \brief Emit proper copying of data from one variable to another. /// /// \param OriginalType Original type of the copied variables. /// \param DestAddr Destination address. /// \param SrcAddr Source address. /// \param DestVD Destination variable used in \a CopyExpr (for arrays, has /// type of the base array element). /// \param SrcVD Source variable used in \a CopyExpr (for arrays, has type of /// the base array element). /// \param Copy Actual copygin expression for copying data from \a SrcVD to \a /// DestVD. void EmitOMPCopy(QualType OriginalType, Address DestAddr, Address SrcAddr, const VarDecl *DestVD, const VarDecl *SrcVD, const Expr *Copy); /// \brief Emit atomic update code for constructs: \a X = \a X \a BO \a E or /// \a X = \a E \a BO \a E. /// /// \param X Value to be updated. /// \param E Update value. /// \param BO Binary operation for update operation. /// \param IsXLHSInRHSPart true if \a X is LHS in RHS part of the update /// expression, false otherwise. /// \param AO Atomic ordering of the generated atomic instructions. /// \param CommonGen Code generator for complex expressions that cannot be /// expressed through atomicrmw instruction. /// \returns if simple 'atomicrmw' instruction was /// generated, otherwise. std::pair EmitOMPAtomicSimpleUpdateExpr( LValue X, RValue E, BinaryOperatorKind BO, bool IsXLHSInRHSPart, llvm::AtomicOrdering AO, SourceLocation Loc, const llvm::function_ref &CommonGen); bool EmitOMPFirstprivateClause(const OMPExecutableDirective &D, OMPPrivateScope &PrivateScope); void EmitOMPPrivateClause(const OMPExecutableDirective &D, OMPPrivateScope &PrivateScope); /// \brief Emit code for copyin clause in \a D directive. The next code is /// generated at the start of outlined functions for directives: /// \code /// threadprivate_var1 = master_threadprivate_var1; /// operator=(threadprivate_var2, master_threadprivate_var2); /// ... /// __kmpc_barrier(&loc, global_tid); /// \endcode /// /// \param D OpenMP directive possibly with 'copyin' clause(s). /// \returns true if at least one copyin variable is found, false otherwise. bool EmitOMPCopyinClause(const OMPExecutableDirective &D); /// \brief Emit initial code for lastprivate variables. If some variable is /// not also firstprivate, then the default initialization is used. Otherwise /// initialization of this variable is performed by EmitOMPFirstprivateClause /// method. /// /// \param D Directive that may have 'lastprivate' directives. /// \param PrivateScope Private scope for capturing lastprivate variables for /// proper codegen in internal captured statement. /// /// \returns true if there is at least one lastprivate variable, false /// otherwise. bool EmitOMPLastprivateClauseInit(const OMPExecutableDirective &D, OMPPrivateScope &PrivateScope); /// \brief Emit final copying of lastprivate values to original variables at /// the end of the worksharing or simd directive. /// /// \param D Directive that has at least one 'lastprivate' directives. /// \param IsLastIterCond Boolean condition that must be set to 'i1 true' if /// it is the last iteration of the loop code in associated directive, or to /// 'i1 false' otherwise. If this item is nullptr, no final check is required. void EmitOMPLastprivateClauseFinal(const OMPExecutableDirective &D, llvm::Value *IsLastIterCond = nullptr); /// \brief Emit initial code for reduction variables. Creates reduction copies /// and initializes them with the values according to OpenMP standard. /// /// \param D Directive (possibly) with the 'reduction' clause. /// \param PrivateScope Private scope for capturing reduction variables for /// proper codegen in internal captured statement. /// void EmitOMPReductionClauseInit(const OMPExecutableDirective &D, OMPPrivateScope &PrivateScope); /// \brief Emit final update of reduction values to original variables at /// the end of the directive. /// /// \param D Directive that has at least one 'reduction' directives. void EmitOMPReductionClauseFinal(const OMPExecutableDirective &D); /// \brief Emit initial code for linear variables. Creates private copies /// and initializes them with the values according to OpenMP standard. /// /// \param D Directive (possibly) with the 'linear' clause. void EmitOMPLinearClauseInit(const OMPLoopDirective &D); void EmitOMPParallelDirective(const OMPParallelDirective &S); void EmitOMPSimdDirective(const OMPSimdDirective &S); void EmitOMPForDirective(const OMPForDirective &S); void EmitOMPForSimdDirective(const OMPForSimdDirective &S); void EmitOMPSectionsDirective(const OMPSectionsDirective &S); void EmitOMPSectionDirective(const OMPSectionDirective &S); void EmitOMPSingleDirective(const OMPSingleDirective &S); void EmitOMPMasterDirective(const OMPMasterDirective &S); void EmitOMPCriticalDirective(const OMPCriticalDirective &S); void EmitOMPParallelForDirective(const OMPParallelForDirective &S); void EmitOMPParallelForSimdDirective(const OMPParallelForSimdDirective &S); void EmitOMPParallelSectionsDirective(const OMPParallelSectionsDirective &S); void EmitOMPTaskDirective(const OMPTaskDirective &S); void EmitOMPTaskyieldDirective(const OMPTaskyieldDirective &S); void EmitOMPBarrierDirective(const OMPBarrierDirective &S); void EmitOMPTaskwaitDirective(const OMPTaskwaitDirective &S); void EmitOMPTaskgroupDirective(const OMPTaskgroupDirective &S); void EmitOMPFlushDirective(const OMPFlushDirective &S); void EmitOMPOrderedDirective(const OMPOrderedDirective &S); void EmitOMPAtomicDirective(const OMPAtomicDirective &S); void EmitOMPTargetDirective(const OMPTargetDirective &S); void EmitOMPTargetDataDirective(const OMPTargetDataDirective &S); void EmitOMPTeamsDirective(const OMPTeamsDirective &S); void EmitOMPCancellationPointDirective(const OMPCancellationPointDirective &S); void EmitOMPCancelDirective(const OMPCancelDirective &S); void EmitOMPTaskLoopDirective(const OMPTaskLoopDirective &S); void EmitOMPTaskLoopSimdDirective(const OMPTaskLoopSimdDirective &S); void EmitOMPDistributeDirective(const OMPDistributeDirective &S); /// \brief Emit inner loop of the worksharing/simd construct. /// /// \param S Directive, for which the inner loop must be emitted. /// \param RequiresCleanup true, if directive has some associated private /// variables. /// \param LoopCond Bollean condition for loop continuation. /// \param IncExpr Increment expression for loop control variable. /// \param BodyGen Generator for the inner body of the inner loop. /// \param PostIncGen Genrator for post-increment code (required for ordered /// loop directvies). void EmitOMPInnerLoop( const Stmt &S, bool RequiresCleanup, const Expr *LoopCond, const Expr *IncExpr, const llvm::function_ref &BodyGen, const llvm::function_ref &PostIncGen); JumpDest getOMPCancelDestination(OpenMPDirectiveKind Kind); private: /// Helpers for the OpenMP loop directives. void EmitOMPLoopBody(const OMPLoopDirective &D, JumpDest LoopExit); void EmitOMPSimdInit(const OMPLoopDirective &D, bool IsMonotonic = false); void EmitOMPSimdFinal(const OMPLoopDirective &D); /// \brief Emit code for the worksharing loop-based directive. /// \return true, if this construct has any lastprivate clause, false - /// otherwise. bool EmitOMPWorksharingLoop(const OMPLoopDirective &S); void EmitOMPForOuterLoop(OpenMPScheduleClauseKind ScheduleKind, bool IsMonotonic, const OMPLoopDirective &S, OMPPrivateScope &LoopScope, bool Ordered, Address LB, Address UB, Address ST, Address IL, llvm::Value *Chunk); /// \brief Emit code for sections directive. OpenMPDirectiveKind EmitSections(const OMPExecutableDirective &S); public: //===--------------------------------------------------------------------===// // LValue Expression Emission //===--------------------------------------------------------------------===// /// GetUndefRValue - Get an appropriate 'undef' rvalue for the given type. RValue GetUndefRValue(QualType Ty); /// EmitUnsupportedRValue - Emit a dummy r-value using the type of E /// and issue an ErrorUnsupported style diagnostic (using the /// provided Name). RValue EmitUnsupportedRValue(const Expr *E, const char *Name); /// EmitUnsupportedLValue - Emit a dummy l-value using the type of E and issue /// an ErrorUnsupported style diagnostic (using the provided Name). LValue EmitUnsupportedLValue(const Expr *E, const char *Name); /// EmitLValue - Emit code to compute a designator that specifies the location /// of the expression. /// /// This can return one of two things: a simple address or a bitfield /// reference. In either case, the LLVM Value* in the LValue structure is /// guaranteed to be an LLVM pointer type. /// /// If this returns a bitfield reference, nothing about the pointee type of /// the LLVM value is known: For example, it may not be a pointer to an /// integer. /// /// If this returns a normal address, and if the lvalue's C type is fixed /// size, this method guarantees that the returned pointer type will point to /// an LLVM type of the same size of the lvalue's type. If the lvalue has a /// variable length type, this is not possible. /// LValue EmitLValue(const Expr *E); /// \brief Same as EmitLValue but additionally we generate checking code to /// guard against undefined behavior. This is only suitable when we know /// that the address will be used to access the object. LValue EmitCheckedLValue(const Expr *E, TypeCheckKind TCK); RValue convertTempToRValue(Address addr, QualType type, SourceLocation Loc); void EmitAtomicInit(Expr *E, LValue lvalue); bool LValueIsSuitableForInlineAtomic(LValue Src); bool typeIsSuitableForInlineAtomic(QualType Ty, bool IsVolatile) const; RValue EmitAtomicLoad(LValue LV, SourceLocation SL, AggValueSlot Slot = AggValueSlot::ignored()); RValue EmitAtomicLoad(LValue lvalue, SourceLocation loc, llvm::AtomicOrdering AO, bool IsVolatile = false, AggValueSlot slot = AggValueSlot::ignored()); void EmitAtomicStore(RValue rvalue, LValue lvalue, bool isInit); void EmitAtomicStore(RValue rvalue, LValue lvalue, llvm::AtomicOrdering AO, bool IsVolatile, bool isInit); std::pair EmitAtomicCompareExchange( LValue Obj, RValue Expected, RValue Desired, SourceLocation Loc, llvm::AtomicOrdering Success = llvm::SequentiallyConsistent, llvm::AtomicOrdering Failure = llvm::SequentiallyConsistent, bool IsWeak = false, AggValueSlot Slot = AggValueSlot::ignored()); void EmitAtomicUpdate(LValue LVal, llvm::AtomicOrdering AO, const llvm::function_ref &UpdateOp, bool IsVolatile); /// EmitToMemory - Change a scalar value from its value /// representation to its in-memory representation. llvm::Value *EmitToMemory(llvm::Value *Value, QualType Ty); /// EmitFromMemory - Change a scalar value from its memory /// representation to its value representation. llvm::Value *EmitFromMemory(llvm::Value *Value, QualType Ty); /// EmitLoadOfScalar - Load a scalar value from an address, taking /// care to appropriately convert from the memory representation to /// the LLVM value representation. llvm::Value *EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource AlignSource = AlignmentSource::Type, llvm::MDNode *TBAAInfo = nullptr, QualType TBAABaseTy = QualType(), uint64_t TBAAOffset = 0, bool isNontemporal = false); /// EmitLoadOfScalar - Load a scalar value from an address, taking /// care to appropriately convert from the memory representation to /// the LLVM value representation. The l-value must be a simple /// l-value. llvm::Value *EmitLoadOfScalar(LValue lvalue, SourceLocation Loc); /// EmitStoreOfScalar - Store a scalar value to an address, taking /// care to appropriately convert from the memory representation to /// the LLVM value representation. void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource AlignSource = AlignmentSource::Type, llvm::MDNode *TBAAInfo = nullptr, bool isInit = false, QualType TBAABaseTy = QualType(), uint64_t TBAAOffset = 0, bool isNontemporal = false); /// EmitStoreOfScalar - Store a scalar value to an address, taking /// care to appropriately convert from the memory representation to /// the LLVM value representation. The l-value must be a simple /// l-value. The isInit flag indicates whether this is an initialization. /// If so, atomic qualifiers are ignored and the store is always non-atomic. void EmitStoreOfScalar(llvm::Value *value, LValue lvalue, bool isInit=false); /// EmitLoadOfLValue - Given an expression that represents a value lvalue, /// this method emits the address of the lvalue, then loads the result as an /// rvalue, returning the rvalue. RValue EmitLoadOfLValue(LValue V, SourceLocation Loc); RValue EmitLoadOfExtVectorElementLValue(LValue V); RValue EmitLoadOfBitfieldLValue(LValue LV); RValue EmitLoadOfGlobalRegLValue(LValue LV); /// EmitStoreThroughLValue - Store the specified rvalue into the specified /// lvalue, where both are guaranteed to the have the same type, and that type /// is 'Ty'. void EmitStoreThroughLValue(RValue Src, LValue Dst, bool isInit = false); void EmitStoreThroughExtVectorComponentLValue(RValue Src, LValue Dst); void EmitStoreThroughGlobalRegLValue(RValue Src, LValue Dst); /// EmitStoreThroughBitfieldLValue - Store Src into Dst with same constraints /// as EmitStoreThroughLValue. /// /// \param Result [out] - If non-null, this will be set to a Value* for the /// bit-field contents after the store, appropriate for use as the result of /// an assignment to the bit-field. void EmitStoreThroughBitfieldLValue(RValue Src, LValue Dst, llvm::Value **Result=nullptr); /// Emit an l-value for an assignment (simple or compound) of complex type. LValue EmitComplexAssignmentLValue(const BinaryOperator *E); LValue EmitComplexCompoundAssignmentLValue(const CompoundAssignOperator *E); LValue EmitScalarCompoundAssignWithComplex(const CompoundAssignOperator *E, llvm::Value *&Result); // Note: only available for agg return types LValue EmitBinaryOperatorLValue(const BinaryOperator *E); LValue EmitCompoundAssignmentLValue(const CompoundAssignOperator *E); // Note: only available for agg return types LValue EmitCallExprLValue(const CallExpr *E); // Note: only available for agg return types LValue EmitVAArgExprLValue(const VAArgExpr *E); LValue EmitDeclRefLValue(const DeclRefExpr *E); LValue EmitStringLiteralLValue(const StringLiteral *E); LValue EmitObjCEncodeExprLValue(const ObjCEncodeExpr *E); LValue EmitPredefinedLValue(const PredefinedExpr *E); LValue EmitUnaryOpLValue(const UnaryOperator *E); LValue EmitArraySubscriptExpr(const ArraySubscriptExpr *E, bool Accessed = false); LValue EmitOMPArraySectionExpr(const OMPArraySectionExpr *E, bool IsLowerBound = true); LValue EmitExtVectorElementExpr(const ExtVectorElementExpr *E); LValue EmitMemberExpr(const MemberExpr *E); LValue EmitObjCIsaExpr(const ObjCIsaExpr *E); LValue EmitCompoundLiteralLValue(const CompoundLiteralExpr *E); LValue EmitInitListLValue(const InitListExpr *E); LValue EmitConditionalOperatorLValue(const AbstractConditionalOperator *E); LValue EmitCastLValue(const CastExpr *E); LValue EmitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *E); LValue EmitOpaqueValueLValue(const OpaqueValueExpr *e); Address EmitExtVectorElementLValue(LValue V); RValue EmitRValueForField(LValue LV, const FieldDecl *FD, SourceLocation Loc); Address EmitArrayToPointerDecay(const Expr *Array, AlignmentSource *AlignSource = nullptr); class ConstantEmission { llvm::PointerIntPair ValueAndIsReference; ConstantEmission(llvm::Constant *C, bool isReference) : ValueAndIsReference(C, isReference) {} public: ConstantEmission() {} static ConstantEmission forReference(llvm::Constant *C) { return ConstantEmission(C, true); } static ConstantEmission forValue(llvm::Constant *C) { return ConstantEmission(C, false); } explicit operator bool() const { return ValueAndIsReference.getOpaqueValue() != nullptr; } bool isReference() const { return ValueAndIsReference.getInt(); } LValue getReferenceLValue(CodeGenFunction &CGF, Expr *refExpr) const { assert(isReference()); return CGF.MakeNaturalAlignAddrLValue(ValueAndIsReference.getPointer(), refExpr->getType()); } llvm::Constant *getValue() const { assert(!isReference()); return ValueAndIsReference.getPointer(); } }; ConstantEmission tryEmitAsConstant(DeclRefExpr *refExpr); RValue EmitPseudoObjectRValue(const PseudoObjectExpr *e, AggValueSlot slot = AggValueSlot::ignored()); LValue EmitPseudoObjectLValue(const PseudoObjectExpr *e); llvm::Value *EmitIvarOffset(const ObjCInterfaceDecl *Interface, const ObjCIvarDecl *Ivar); LValue EmitLValueForField(LValue Base, const FieldDecl* Field); LValue EmitLValueForLambdaField(const FieldDecl *Field); /// EmitLValueForFieldInitialization - Like EmitLValueForField, except that /// if the Field is a reference, this will return the address of the reference /// and not the address of the value stored in the reference. LValue EmitLValueForFieldInitialization(LValue Base, const FieldDecl* Field); LValue EmitLValueForIvar(QualType ObjectTy, llvm::Value* Base, const ObjCIvarDecl *Ivar, unsigned CVRQualifiers); LValue EmitCXXConstructLValue(const CXXConstructExpr *E); LValue EmitCXXBindTemporaryLValue(const CXXBindTemporaryExpr *E); LValue EmitLambdaLValue(const LambdaExpr *E); LValue EmitCXXTypeidLValue(const CXXTypeidExpr *E); LValue EmitCXXUuidofLValue(const CXXUuidofExpr *E); LValue EmitObjCMessageExprLValue(const ObjCMessageExpr *E); LValue EmitObjCIvarRefLValue(const ObjCIvarRefExpr *E); LValue EmitStmtExprLValue(const StmtExpr *E); LValue EmitPointerToDataMemberBinaryExpr(const BinaryOperator *E); LValue EmitObjCSelectorLValue(const ObjCSelectorExpr *E); void EmitDeclRefExprDbgValue(const DeclRefExpr *E, llvm::Constant *Init); //===--------------------------------------------------------------------===// // Scalar Expression Emission //===--------------------------------------------------------------------===// /// EmitCall - Generate a call of the given function, expecting the given /// result type, and using the given argument list which specifies both the /// LLVM arguments and the types they were derived from. RValue EmitCall(const CGFunctionInfo &FnInfo, llvm::Value *Callee, ReturnValueSlot ReturnValue, const CallArgList &Args, CGCalleeInfo CalleeInfo = CGCalleeInfo(), llvm::Instruction **callOrInvoke = nullptr); RValue EmitCall(QualType FnType, llvm::Value *Callee, const CallExpr *E, ReturnValueSlot ReturnValue, CGCalleeInfo CalleeInfo = CGCalleeInfo(), llvm::Value *Chain = nullptr); RValue EmitCallExpr(const CallExpr *E, ReturnValueSlot ReturnValue = ReturnValueSlot()); void checkTargetFeatures(const CallExpr *E, const FunctionDecl *TargetDecl); llvm::CallInst *EmitRuntimeCall(llvm::Value *callee, const Twine &name = ""); llvm::CallInst *EmitRuntimeCall(llvm::Value *callee, ArrayRef args, const Twine &name = ""); llvm::CallInst *EmitNounwindRuntimeCall(llvm::Value *callee, const Twine &name = ""); llvm::CallInst *EmitNounwindRuntimeCall(llvm::Value *callee, ArrayRef args, const Twine &name = ""); llvm::CallSite EmitCallOrInvoke(llvm::Value *Callee, ArrayRef Args, const Twine &Name = ""); llvm::CallSite EmitRuntimeCallOrInvoke(llvm::Value *callee, ArrayRef args, const Twine &name = ""); llvm::CallSite EmitRuntimeCallOrInvoke(llvm::Value *callee, const Twine &name = ""); void EmitNoreturnRuntimeCallOrInvoke(llvm::Value *callee, ArrayRef args); llvm::Value *BuildAppleKextVirtualCall(const CXXMethodDecl *MD, NestedNameSpecifier *Qual, llvm::Type *Ty); llvm::Value *BuildAppleKextVirtualDestructorCall(const CXXDestructorDecl *DD, CXXDtorType Type, const CXXRecordDecl *RD); RValue EmitCXXMemberOrOperatorCall(const CXXMethodDecl *MD, llvm::Value *Callee, ReturnValueSlot ReturnValue, llvm::Value *This, llvm::Value *ImplicitParam, QualType ImplicitParamTy, const CallExpr *E); RValue EmitCXXStructorCall(const CXXMethodDecl *MD, llvm::Value *Callee, ReturnValueSlot ReturnValue, llvm::Value *This, llvm::Value *ImplicitParam, QualType ImplicitParamTy, const CallExpr *E, StructorType Type); RValue EmitCXXMemberCallExpr(const CXXMemberCallExpr *E, ReturnValueSlot ReturnValue); RValue EmitCXXMemberOrOperatorMemberCallExpr(const CallExpr *CE, const CXXMethodDecl *MD, ReturnValueSlot ReturnValue, bool HasQualifier, NestedNameSpecifier *Qualifier, bool IsArrow, const Expr *Base); // Compute the object pointer. Address EmitCXXMemberDataPointerAddress(const Expr *E, Address base, llvm::Value *memberPtr, const MemberPointerType *memberPtrType, AlignmentSource *AlignSource = nullptr); RValue EmitCXXMemberPointerCallExpr(const CXXMemberCallExpr *E, ReturnValueSlot ReturnValue); RValue EmitCXXOperatorMemberCallExpr(const CXXOperatorCallExpr *E, const CXXMethodDecl *MD, ReturnValueSlot ReturnValue); RValue EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E, ReturnValueSlot ReturnValue); RValue EmitBuiltinExpr(const FunctionDecl *FD, unsigned BuiltinID, const CallExpr *E, ReturnValueSlot ReturnValue); RValue EmitBlockCallExpr(const CallExpr *E, ReturnValueSlot ReturnValue); /// EmitTargetBuiltinExpr - Emit the given builtin call. Returns 0 if the call /// is unhandled by the current target. llvm::Value *EmitTargetBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitAArch64CompareBuiltinExpr(llvm::Value *Op, llvm::Type *Ty, const llvm::CmpInst::Predicate Fp, const llvm::CmpInst::Predicate Ip, const llvm::Twine &Name = ""); llvm::Value *EmitARMBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitCommonNeonBuiltinExpr(unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic, const char *NameHint, unsigned Modifier, const CallExpr *E, SmallVectorImpl &Ops, Address PtrOp0, Address PtrOp1); llvm::Function *LookupNeonLLVMIntrinsic(unsigned IntrinsicID, unsigned Modifier, llvm::Type *ArgTy, const CallExpr *E); llvm::Value *EmitNeonCall(llvm::Function *F, SmallVectorImpl &O, const char *name, unsigned shift = 0, bool rightshift = false); llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx); llvm::Value *EmitNeonShiftVector(llvm::Value *V, llvm::Type *Ty, bool negateForRightShift); llvm::Value *EmitNeonRShiftImm(llvm::Value *Vec, llvm::Value *Amt, llvm::Type *Ty, bool usgn, const char *name); llvm::Value *vectorWrapScalar16(llvm::Value *Op); llvm::Value *EmitAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *BuildVector(ArrayRef Ops); llvm::Value *EmitX86BuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitPPCBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitSystemZBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitObjCProtocolExpr(const ObjCProtocolExpr *E); llvm::Value *EmitObjCStringLiteral(const ObjCStringLiteral *E); llvm::Value *EmitObjCBoxedExpr(const ObjCBoxedExpr *E); llvm::Value *EmitObjCArrayLiteral(const ObjCArrayLiteral *E); llvm::Value *EmitObjCDictionaryLiteral(const ObjCDictionaryLiteral *E); llvm::Value *EmitObjCCollectionLiteral(const Expr *E, const ObjCMethodDecl *MethodWithObjects); llvm::Value *EmitObjCSelectorExpr(const ObjCSelectorExpr *E); RValue EmitObjCMessageExpr(const ObjCMessageExpr *E, ReturnValueSlot Return = ReturnValueSlot()); /// Retrieves the default cleanup kind for an ARC cleanup. /// Except under -fobjc-arc-eh, ARC cleanups are normal-only. CleanupKind getARCCleanupKind() { return CGM.getCodeGenOpts().ObjCAutoRefCountExceptions ? NormalAndEHCleanup : NormalCleanup; } // ARC primitives. void EmitARCInitWeak(Address addr, llvm::Value *value); void EmitARCDestroyWeak(Address addr); llvm::Value *EmitARCLoadWeak(Address addr); llvm::Value *EmitARCLoadWeakRetained(Address addr); llvm::Value *EmitARCStoreWeak(Address addr, llvm::Value *value, bool ignored); void EmitARCCopyWeak(Address dst, Address src); void EmitARCMoveWeak(Address dst, Address src); llvm::Value *EmitARCRetainAutorelease(QualType type, llvm::Value *value); llvm::Value *EmitARCRetainAutoreleaseNonBlock(llvm::Value *value); llvm::Value *EmitARCStoreStrong(LValue lvalue, llvm::Value *value, bool resultIgnored); llvm::Value *EmitARCStoreStrongCall(Address addr, llvm::Value *value, bool resultIgnored); llvm::Value *EmitARCRetain(QualType type, llvm::Value *value); llvm::Value *EmitARCRetainNonBlock(llvm::Value *value); llvm::Value *EmitARCRetainBlock(llvm::Value *value, bool mandatory); void EmitARCDestroyStrong(Address addr, ARCPreciseLifetime_t precise); void EmitARCRelease(llvm::Value *value, ARCPreciseLifetime_t precise); llvm::Value *EmitARCAutorelease(llvm::Value *value); llvm::Value *EmitARCAutoreleaseReturnValue(llvm::Value *value); llvm::Value *EmitARCRetainAutoreleaseReturnValue(llvm::Value *value); llvm::Value *EmitARCRetainAutoreleasedReturnValue(llvm::Value *value); std::pair EmitARCStoreAutoreleasing(const BinaryOperator *e); std::pair EmitARCStoreStrong(const BinaryOperator *e, bool ignored); llvm::Value *EmitObjCThrowOperand(const Expr *expr); llvm::Value *EmitObjCConsumeObject(QualType T, llvm::Value *Ptr); llvm::Value *EmitObjCExtendObjectLifetime(QualType T, llvm::Value *Ptr); llvm::Value *EmitARCExtendBlockObject(const Expr *expr); llvm::Value *EmitARCRetainScalarExpr(const Expr *expr); llvm::Value *EmitARCRetainAutoreleaseScalarExpr(const Expr *expr); void EmitARCIntrinsicUse(ArrayRef values); static Destroyer destroyARCStrongImprecise; static Destroyer destroyARCStrongPrecise; static Destroyer destroyARCWeak; void EmitObjCAutoreleasePoolPop(llvm::Value *Ptr); llvm::Value *EmitObjCAutoreleasePoolPush(); llvm::Value *EmitObjCMRRAutoreleasePoolPush(); void EmitObjCAutoreleasePoolCleanup(llvm::Value *Ptr); void EmitObjCMRRAutoreleasePoolPop(llvm::Value *Ptr); /// \brief Emits a reference binding to the passed in expression. RValue EmitReferenceBindingToExpr(const Expr *E); //===--------------------------------------------------------------------===// // Expression Emission //===--------------------------------------------------------------------===// // Expressions are broken into three classes: scalar, complex, aggregate. /// EmitScalarExpr - Emit the computation of the specified expression of LLVM /// scalar type, returning the result. llvm::Value *EmitScalarExpr(const Expr *E , bool IgnoreResultAssign = false); /// Emit a conversion from the specified type to the specified destination /// type, both of which are LLVM scalar types. llvm::Value *EmitScalarConversion(llvm::Value *Src, QualType SrcTy, QualType DstTy, SourceLocation Loc); /// Emit a conversion from the specified complex type to the specified /// destination type, where the destination type is an LLVM scalar type. llvm::Value *EmitComplexToScalarConversion(ComplexPairTy Src, QualType SrcTy, QualType DstTy, SourceLocation Loc); /// EmitAggExpr - Emit the computation of the specified expression /// of aggregate type. The result is computed into the given slot, /// which may be null to indicate that the value is not needed. void EmitAggExpr(const Expr *E, AggValueSlot AS); /// EmitAggExprToLValue - Emit the computation of the specified expression of /// aggregate type into a temporary LValue. LValue EmitAggExprToLValue(const Expr *E); /// EmitExtendGCLifetime - Given a pointer to an Objective-C object, /// make sure it survives garbage collection until this point. void EmitExtendGCLifetime(llvm::Value *object); /// EmitComplexExpr - Emit the computation of the specified expression of /// complex type, returning the result. ComplexPairTy EmitComplexExpr(const Expr *E, bool IgnoreReal = false, bool IgnoreImag = false); /// EmitComplexExprIntoLValue - Emit the given expression of complex /// type and place its result into the specified l-value. void EmitComplexExprIntoLValue(const Expr *E, LValue dest, bool isInit); /// EmitStoreOfComplex - Store a complex number into the specified l-value. void EmitStoreOfComplex(ComplexPairTy V, LValue dest, bool isInit); /// EmitLoadOfComplex - Load a complex number from the specified l-value. ComplexPairTy EmitLoadOfComplex(LValue src, SourceLocation loc); Address emitAddrOfRealComponent(Address complex, QualType complexType); Address emitAddrOfImagComponent(Address complex, QualType complexType); /// AddInitializerToStaticVarDecl - Add the initializer for 'D' to the /// global variable that has already been created for it. If the initializer /// has a different type than GV does, this may free GV and return a different /// one. Otherwise it just returns GV. llvm::GlobalVariable * AddInitializerToStaticVarDecl(const VarDecl &D, llvm::GlobalVariable *GV); /// EmitCXXGlobalVarDeclInit - Create the initializer for a C++ /// variable with global storage. void EmitCXXGlobalVarDeclInit(const VarDecl &D, llvm::Constant *DeclPtr, bool PerformInit); llvm::Constant *createAtExitStub(const VarDecl &VD, llvm::Constant *Dtor, llvm::Constant *Addr); /// Call atexit() with a function that passes the given argument to /// the given function. void registerGlobalDtorWithAtExit(const VarDecl &D, llvm::Constant *fn, llvm::Constant *addr); /// Emit code in this function to perform a guarded variable /// initialization. Guarded initializations are used when it's not /// possible to prove that an initialization will be done exactly /// once, e.g. with a static local variable or a static data member /// of a class template. void EmitCXXGuardedInit(const VarDecl &D, llvm::GlobalVariable *DeclPtr, bool PerformInit); /// GenerateCXXGlobalInitFunc - Generates code for initializing global /// variables. void GenerateCXXGlobalInitFunc(llvm::Function *Fn, ArrayRef CXXThreadLocals, Address Guard = Address::invalid()); /// GenerateCXXGlobalDtorsFunc - Generates code for destroying global /// variables. void GenerateCXXGlobalDtorsFunc(llvm::Function *Fn, const std::vector > &DtorsAndObjects); void GenerateCXXGlobalVarDeclInitFunc(llvm::Function *Fn, const VarDecl *D, llvm::GlobalVariable *Addr, bool PerformInit); void EmitCXXConstructExpr(const CXXConstructExpr *E, AggValueSlot Dest); void EmitSynthesizedCXXCopyCtor(Address Dest, Address Src, const Expr *Exp); void enterFullExpression(const ExprWithCleanups *E) { if (E->getNumObjects() == 0) return; enterNonTrivialFullExpression(E); } void enterNonTrivialFullExpression(const ExprWithCleanups *E); void EmitCXXThrowExpr(const CXXThrowExpr *E, bool KeepInsertionPoint = true); void EmitLambdaExpr(const LambdaExpr *E, AggValueSlot Dest); RValue EmitAtomicExpr(AtomicExpr *E); //===--------------------------------------------------------------------===// // Annotations Emission //===--------------------------------------------------------------------===// /// Emit an annotation call (intrinsic or builtin). llvm::Value *EmitAnnotationCall(llvm::Value *AnnotationFn, llvm::Value *AnnotatedVal, StringRef AnnotationStr, SourceLocation Location); /// Emit local annotations for the local variable V, declared by D. void EmitVarAnnotations(const VarDecl *D, llvm::Value *V); /// Emit field annotations for the given field & value. Returns the /// annotation result. Address EmitFieldAnnotations(const FieldDecl *D, Address V); //===--------------------------------------------------------------------===// // Internal Helpers //===--------------------------------------------------------------------===// /// ContainsLabel - Return true if the statement contains a label in it. If /// this statement is not executed normally, it not containing a label means /// that we can just remove the code. static bool ContainsLabel(const Stmt *S, bool IgnoreCaseStmts = false); /// containsBreak - Return true if the statement contains a break out of it. /// If the statement (recursively) contains a switch or loop with a break /// inside of it, this is fine. static bool containsBreak(const Stmt *S); /// ConstantFoldsToSimpleInteger - If the specified expression does not fold /// to a constant, or if it does but contains a label, return false. If it /// constant folds return true and set the boolean result in Result. bool ConstantFoldsToSimpleInteger(const Expr *Cond, bool &Result); /// ConstantFoldsToSimpleInteger - If the specified expression does not fold /// to a constant, or if it does but contains a label, return false. If it /// constant folds return true and set the folded value. bool ConstantFoldsToSimpleInteger(const Expr *Cond, llvm::APSInt &Result); /// EmitBranchOnBoolExpr - Emit a branch on a boolean condition (e.g. for an /// if statement) to the specified blocks. Based on the condition, this might /// try to simplify the codegen of the conditional based on the branch. /// TrueCount should be the number of times we expect the condition to /// evaluate to true based on PGO data. void EmitBranchOnBoolExpr(const Expr *Cond, llvm::BasicBlock *TrueBlock, llvm::BasicBlock *FalseBlock, uint64_t TrueCount); /// \brief Emit a description of a type in a format suitable for passing to /// a runtime sanitizer handler. llvm::Constant *EmitCheckTypeDescriptor(QualType T); /// \brief Convert a value into a format suitable for passing to a runtime /// sanitizer handler. llvm::Value *EmitCheckValue(llvm::Value *V); /// \brief Emit a description of a source location in a format suitable for /// passing to a runtime sanitizer handler. llvm::Constant *EmitCheckSourceLocation(SourceLocation Loc); /// \brief Create a basic block that will call a handler function in a /// sanitizer runtime with the provided arguments, and create a conditional /// branch to it. void EmitCheck(ArrayRef> Checked, StringRef CheckName, ArrayRef StaticArgs, ArrayRef DynamicArgs); /// \brief Emit a slow path cross-DSO CFI check which calls __cfi_slowpath /// if Cond if false. void EmitCfiSlowPathCheck(llvm::Value *Cond, llvm::ConstantInt *TypeId, llvm::Value *Ptr); /// \brief Create a basic block that will call the trap intrinsic, and emit a /// conditional branch to it, for the -ftrapv checks. void EmitTrapCheck(llvm::Value *Checked); /// \brief Emit a call to trap or debugtrap and attach function attribute /// "trap-func-name" if specified. llvm::CallInst *EmitTrapCall(llvm::Intrinsic::ID IntrID); /// \brief Create a check for a function parameter that may potentially be /// declared as non-null. void EmitNonNullArgCheck(RValue RV, QualType ArgType, SourceLocation ArgLoc, const FunctionDecl *FD, unsigned ParmNum); /// EmitCallArg - Emit a single call argument. void EmitCallArg(CallArgList &args, const Expr *E, QualType ArgType); /// EmitDelegateCallArg - We are performing a delegate call; that /// is, the current function is delegating to another one. Produce /// a r-value suitable for passing the given parameter. void EmitDelegateCallArg(CallArgList &args, const VarDecl *param, SourceLocation loc); /// SetFPAccuracy - Set the minimum required accuracy of the given floating /// point operation, expressed as the maximum relative error in ulp. void SetFPAccuracy(llvm::Value *Val, float Accuracy); private: llvm::MDNode *getRangeForLoadFromType(QualType Ty); void EmitReturnOfRValue(RValue RV, QualType Ty); void deferPlaceholderReplacement(llvm::Instruction *Old, llvm::Value *New); llvm::SmallVector, 4> DeferredReplacements; /// Set the address of a local variable. void setAddrOfLocalVar(const VarDecl *VD, Address Addr) { assert(!LocalDeclMap.count(VD) && "Decl already exists in LocalDeclMap!"); LocalDeclMap.insert({VD, Addr}); } /// ExpandTypeFromArgs - Reconstruct a structure of type \arg Ty /// from function arguments into \arg Dst. See ABIArgInfo::Expand. /// /// \param AI - The first function argument of the expansion. void ExpandTypeFromArgs(QualType Ty, LValue Dst, SmallVectorImpl::iterator &AI); /// ExpandTypeToArgs - Expand an RValue \arg RV, with the LLVM type for \arg /// Ty, into individual arguments on the provided vector \arg IRCallArgs, /// starting at index \arg IRCallArgPos. See ABIArgInfo::Expand. void ExpandTypeToArgs(QualType Ty, RValue RV, llvm::FunctionType *IRFuncTy, SmallVectorImpl &IRCallArgs, unsigned &IRCallArgPos); llvm::Value* EmitAsmInput(const TargetInfo::ConstraintInfo &Info, const Expr *InputExpr, std::string &ConstraintStr); llvm::Value* EmitAsmInputLValue(const TargetInfo::ConstraintInfo &Info, LValue InputValue, QualType InputType, std::string &ConstraintStr, SourceLocation Loc); /// \brief Attempts to statically evaluate the object size of E. If that /// fails, emits code to figure the size of E out for us. This is /// pass_object_size aware. llvm::Value *evaluateOrEmitBuiltinObjectSize(const Expr *E, unsigned Type, llvm::IntegerType *ResType); /// \brief Emits the size of E, as required by __builtin_object_size. This /// function is aware of pass_object_size parameters, and will act accordingly /// if E is a parameter with the pass_object_size attribute. llvm::Value *emitBuiltinObjectSize(const Expr *E, unsigned Type, llvm::IntegerType *ResType); public: #ifndef NDEBUG // Determine whether the given argument is an Objective-C method // that may have type parameters in its signature. static bool isObjCMethodWithTypeParams(const ObjCMethodDecl *method) { const DeclContext *dc = method->getDeclContext(); if (const ObjCInterfaceDecl *classDecl= dyn_cast(dc)) { return classDecl->getTypeParamListAsWritten(); } if (const ObjCCategoryDecl *catDecl = dyn_cast(dc)) { return catDecl->getTypeParamList(); } return false; } template static bool isObjCMethodWithTypeParams(const T *) { return false; } #endif /// EmitCallArgs - Emit call arguments for a function. template void EmitCallArgs(CallArgList &Args, const T *CallArgTypeInfo, llvm::iterator_range ArgRange, const FunctionDecl *CalleeDecl = nullptr, unsigned ParamsToSkip = 0) { SmallVector ArgTypes; CallExpr::const_arg_iterator Arg = ArgRange.begin(); assert((ParamsToSkip == 0 || CallArgTypeInfo) && "Can't skip parameters if type info is not provided"); if (CallArgTypeInfo) { #ifndef NDEBUG bool isGenericMethod = isObjCMethodWithTypeParams(CallArgTypeInfo); #endif // First, use the argument types that the type info knows about for (auto I = CallArgTypeInfo->param_type_begin() + ParamsToSkip, E = CallArgTypeInfo->param_type_end(); I != E; ++I, ++Arg) { assert(Arg != ArgRange.end() && "Running over edge of argument list!"); assert((isGenericMethod || ((*I)->isVariablyModifiedType() || (*I).getNonReferenceType()->isObjCRetainableType() || getContext() .getCanonicalType((*I).getNonReferenceType()) .getTypePtr() == getContext() .getCanonicalType((*Arg)->getType()) .getTypePtr())) && "type mismatch in call argument!"); ArgTypes.push_back(*I); } } // Either we've emitted all the call args, or we have a call to variadic // function. assert((Arg == ArgRange.end() || !CallArgTypeInfo || CallArgTypeInfo->isVariadic()) && "Extra arguments in non-variadic function!"); // If we still have any arguments, emit them using the type of the argument. for (auto *A : llvm::make_range(Arg, ArgRange.end())) ArgTypes.push_back(getVarArgType(A)); EmitCallArgs(Args, ArgTypes, ArgRange, CalleeDecl, ParamsToSkip); } void EmitCallArgs(CallArgList &Args, ArrayRef ArgTypes, llvm::iterator_range ArgRange, const FunctionDecl *CalleeDecl = nullptr, unsigned ParamsToSkip = 0); /// EmitPointerWithAlignment - Given an expression with a pointer /// type, emit the value and compute our best estimate of the /// alignment of the pointee. /// /// Note that this function will conservatively fall back on the type /// when it doesn't /// /// \param Source - If non-null, this will be initialized with /// information about the source of the alignment. Note that this /// function will conservatively fall back on the type when it /// doesn't recognize the expression, which means that sometimes /// /// a worst-case One /// reasonable way to use this information is when there's a /// language guarantee that the pointer must be aligned to some /// stricter value, and we're simply trying to ensure that /// sufficiently obvious uses of under-aligned objects don't get /// miscompiled; for example, a placement new into the address of /// a local variable. In such a case, it's quite reasonable to /// just ignore the returned alignment when it isn't from an /// explicit source. Address EmitPointerWithAlignment(const Expr *Addr, AlignmentSource *Source = nullptr); private: QualType getVarArgType(const Expr *Arg); const TargetCodeGenInfo &getTargetHooks() const { return CGM.getTargetCodeGenInfo(); } void EmitDeclMetadata(); BlockByrefHelpers *buildByrefHelpers(llvm::StructType &byrefType, const AutoVarEmission &emission); void AddObjCARCExceptionMetadata(llvm::Instruction *Inst); llvm::Value *GetValueForARMHint(unsigned BuiltinID); }; /// Helper class with most of the code for saving a value for a /// conditional expression cleanup. struct DominatingLLVMValue { typedef llvm::PointerIntPair saved_type; /// Answer whether the given value needs extra work to be saved. static bool needsSaving(llvm::Value *value) { // If it's not an instruction, we don't need to save. if (!isa(value)) return false; // If it's an instruction in the entry block, we don't need to save. llvm::BasicBlock *block = cast(value)->getParent(); return (block != &block->getParent()->getEntryBlock()); } /// Try to save the given value. static saved_type save(CodeGenFunction &CGF, llvm::Value *value) { if (!needsSaving(value)) return saved_type(value, false); // Otherwise, we need an alloca. auto align = CharUnits::fromQuantity( CGF.CGM.getDataLayout().getPrefTypeAlignment(value->getType())); Address alloca = CGF.CreateTempAlloca(value->getType(), align, "cond-cleanup.save"); CGF.Builder.CreateStore(value, alloca); return saved_type(alloca.getPointer(), true); } static llvm::Value *restore(CodeGenFunction &CGF, saved_type value) { // If the value says it wasn't saved, trust that it's still dominating. if (!value.getInt()) return value.getPointer(); // Otherwise, it should be an alloca instruction, as set up in save(). auto alloca = cast(value.getPointer()); return CGF.Builder.CreateAlignedLoad(alloca, alloca->getAlignment()); } }; /// A partial specialization of DominatingValue for llvm::Values that /// might be llvm::Instructions. template struct DominatingPointer : DominatingLLVMValue { typedef T *type; static type restore(CodeGenFunction &CGF, saved_type value) { return static_cast(DominatingLLVMValue::restore(CGF, value)); } }; /// A specialization of DominatingValue for Address. template <> struct DominatingValue
{ typedef Address type; struct saved_type { DominatingLLVMValue::saved_type SavedValue; CharUnits Alignment; }; static bool needsSaving(type value) { return DominatingLLVMValue::needsSaving(value.getPointer()); } static saved_type save(CodeGenFunction &CGF, type value) { return { DominatingLLVMValue::save(CGF, value.getPointer()), value.getAlignment() }; } static type restore(CodeGenFunction &CGF, saved_type value) { return Address(DominatingLLVMValue::restore(CGF, value.SavedValue), value.Alignment); } }; /// A specialization of DominatingValue for RValue. template <> struct DominatingValue { typedef RValue type; class saved_type { enum Kind { ScalarLiteral, ScalarAddress, AggregateLiteral, AggregateAddress, ComplexAddress }; llvm::Value *Value; unsigned K : 3; unsigned Align : 29; saved_type(llvm::Value *v, Kind k, unsigned a = 0) : Value(v), K(k), Align(a) {} public: static bool needsSaving(RValue value); static saved_type save(CodeGenFunction &CGF, RValue value); RValue restore(CodeGenFunction &CGF); // implementations in CGCleanup.cpp }; static bool needsSaving(type value) { return saved_type::needsSaving(value); } static saved_type save(CodeGenFunction &CGF, type value) { return saved_type::save(CGF, value); } static type restore(CodeGenFunction &CGF, saved_type value) { return value.restore(CGF); } }; } // end namespace CodeGen } // end namespace clang #endif Index: projects/clang380-import/contrib/llvm/tools/clang =================================================================== --- projects/clang380-import/contrib/llvm/tools/clang (revision 294608) +++ projects/clang380-import/contrib/llvm/tools/clang (revision 294609) Property changes on: projects/clang380-import/contrib/llvm/tools/clang ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor/clang/dist:r294170-294607 Index: projects/clang380-import/contrib/llvm/tools/lli/lli.cpp =================================================================== --- projects/clang380-import/contrib/llvm/tools/lli/lli.cpp (revision 294608) +++ projects/clang380-import/contrib/llvm/tools/lli/lli.cpp (revision 294609) @@ -1,769 +1,770 @@ //===- lli.cpp - LLVM Interpreter / Dynamic compiler ----------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This utility provides a simple wrapper around the LLVM Execution Engines, // which allow the direct execution of LLVM programs through a Just-In-Time // compiler, or through an interpreter if no JIT is available for this platform. // //===----------------------------------------------------------------------===// #include "OrcLazyJIT.h" #include "RemoteJITUtils.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/CodeGen/LinkAllCodegenComponents.h" #include "llvm/ExecutionEngine/GenericValue.h" #include "llvm/ExecutionEngine/Interpreter.h" #include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/MCJIT.h" #include "llvm/ExecutionEngine/ObjectCache.h" #include "llvm/ExecutionEngine/OrcMCJITReplacement.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/TypeBuilder.h" #include "llvm/IRReader/IRReader.h" #include "llvm/Object/Archive.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/DynamicLibrary.h" #include "llvm/Support/Format.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Memory.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/PluginLoader.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/Process.h" #include "llvm/Support/Program.h" #include "llvm/Support/Signals.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" #include #ifdef __CYGWIN__ #include #if defined(CYGWIN_VERSION_DLL_MAJOR) && CYGWIN_VERSION_DLL_MAJOR<1007 #define DO_NOTHING_ATEXIT 1 #endif #endif using namespace llvm; #define DEBUG_TYPE "lli" namespace { enum class JITKind { MCJIT, OrcMCJITReplacement, OrcLazy }; cl::opt InputFile(cl::desc(""), cl::Positional, cl::init("-")); cl::list InputArgv(cl::ConsumeAfter, cl::desc("...")); cl::opt ForceInterpreter("force-interpreter", cl::desc("Force interpretation: disable JIT"), cl::init(false)); cl::opt UseJITKind("jit-kind", cl::desc("Choose underlying JIT kind."), cl::init(JITKind::MCJIT), cl::values( clEnumValN(JITKind::MCJIT, "mcjit", "MCJIT"), clEnumValN(JITKind::OrcMCJITReplacement, "orc-mcjit", "Orc-based MCJIT replacement"), clEnumValN(JITKind::OrcLazy, "orc-lazy", "Orc-based lazy JIT."), clEnumValEnd)); // The MCJIT supports building for a target address space separate from // the JIT compilation process. Use a forked process and a copying // memory manager with IPC to execute using this functionality. cl::opt RemoteMCJIT("remote-mcjit", cl::desc("Execute MCJIT'ed code in a separate process."), cl::init(false)); // Manually specify the child process for remote execution. This overrides // the simulated remote execution that allocates address space for child // execution. The child process will be executed and will communicate with // lli via stdin/stdout pipes. cl::opt ChildExecPath("mcjit-remote-process", cl::desc("Specify the filename of the process to launch " "for remote MCJIT execution. If none is specified," "\n\tremote execution will be simulated in-process."), cl::value_desc("filename"), cl::init("")); // Determine optimization level. cl::opt OptLevel("O", cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] " "(default = '-O2')"), cl::Prefix, cl::ZeroOrMore, cl::init(' ')); cl::opt TargetTriple("mtriple", cl::desc("Override target triple for module")); cl::opt MArch("march", cl::desc("Architecture to generate assembly for (see --version)")); cl::opt MCPU("mcpu", cl::desc("Target a specific cpu type (-mcpu=help for details)"), cl::value_desc("cpu-name"), cl::init("")); cl::list MAttrs("mattr", cl::CommaSeparated, cl::desc("Target specific attributes (-mattr=help for details)"), cl::value_desc("a1,+a2,-a3,...")); cl::opt EntryFunc("entry-function", cl::desc("Specify the entry function (default = 'main') " "of the executable"), cl::value_desc("function"), cl::init("main")); cl::list ExtraModules("extra-module", cl::desc("Extra modules to be loaded"), cl::value_desc("input bitcode")); cl::list ExtraObjects("extra-object", cl::desc("Extra object files to be loaded"), cl::value_desc("input object")); cl::list ExtraArchives("extra-archive", cl::desc("Extra archive files to be loaded"), cl::value_desc("input archive")); cl::opt EnableCacheManager("enable-cache-manager", cl::desc("Use cache manager to save/load mdoules"), cl::init(false)); cl::opt ObjectCacheDir("object-cache-dir", cl::desc("Directory to store cached object files " "(must be user writable)"), cl::init("")); cl::opt FakeArgv0("fake-argv0", cl::desc("Override the 'argv[0]' value passed into the executing" " program"), cl::value_desc("executable")); cl::opt DisableCoreFiles("disable-core-files", cl::Hidden, cl::desc("Disable emission of core files if possible")); cl::opt NoLazyCompilation("disable-lazy-compilation", cl::desc("Disable JIT lazy compilation"), cl::init(false)); cl::opt RelocModel("relocation-model", cl::desc("Choose relocation model"), cl::init(Reloc::Default), cl::values( clEnumValN(Reloc::Default, "default", "Target default relocation model"), clEnumValN(Reloc::Static, "static", "Non-relocatable code"), clEnumValN(Reloc::PIC_, "pic", "Fully relocatable, position independent code"), clEnumValN(Reloc::DynamicNoPIC, "dynamic-no-pic", "Relocatable external references, non-relocatable code"), clEnumValEnd)); cl::opt CMModel("code-model", cl::desc("Choose code model"), cl::init(CodeModel::JITDefault), cl::values(clEnumValN(CodeModel::JITDefault, "default", "Target default JIT code model"), clEnumValN(CodeModel::Small, "small", "Small code model"), clEnumValN(CodeModel::Kernel, "kernel", "Kernel code model"), clEnumValN(CodeModel::Medium, "medium", "Medium code model"), clEnumValN(CodeModel::Large, "large", "Large code model"), clEnumValEnd)); cl::opt GenerateSoftFloatCalls("soft-float", cl::desc("Generate software floating point library calls"), cl::init(false)); cl::opt FloatABIForCalls("float-abi", cl::desc("Choose float ABI type"), cl::init(FloatABI::Default), cl::values( clEnumValN(FloatABI::Default, "default", "Target default float ABI type"), clEnumValN(FloatABI::Soft, "soft", "Soft float ABI (implied by -soft-float)"), clEnumValN(FloatABI::Hard, "hard", "Hard float ABI (uses FP registers)"), clEnumValEnd)); } //===----------------------------------------------------------------------===// // Object cache // // This object cache implementation writes cached objects to disk to the // directory specified by CacheDir, using a filename provided in the module // descriptor. The cache tries to load a saved object using that path if the // file exists. CacheDir defaults to "", in which case objects are cached // alongside their originating bitcodes. // class LLIObjectCache : public ObjectCache { public: LLIObjectCache(const std::string& CacheDir) : CacheDir(CacheDir) { // Add trailing '/' to cache dir if necessary. if (!this->CacheDir.empty() && this->CacheDir[this->CacheDir.size() - 1] != '/') this->CacheDir += '/'; } ~LLIObjectCache() override {} void notifyObjectCompiled(const Module *M, MemoryBufferRef Obj) override { const std::string ModuleID = M->getModuleIdentifier(); std::string CacheName; if (!getCacheFilename(ModuleID, CacheName)) return; if (!CacheDir.empty()) { // Create user-defined cache dir. SmallString<128> dir(sys::path::parent_path(CacheName)); sys::fs::create_directories(Twine(dir)); } std::error_code EC; raw_fd_ostream outfile(CacheName, EC, sys::fs::F_None); outfile.write(Obj.getBufferStart(), Obj.getBufferSize()); outfile.close(); } std::unique_ptr getObject(const Module* M) override { const std::string ModuleID = M->getModuleIdentifier(); std::string CacheName; if (!getCacheFilename(ModuleID, CacheName)) return nullptr; // Load the object from the cache filename ErrorOr> IRObjectBuffer = MemoryBuffer::getFile(CacheName.c_str(), -1, false); // If the file isn't there, that's OK. if (!IRObjectBuffer) return nullptr; // MCJIT will want to write into this buffer, and we don't want that // because the file has probably just been mmapped. Instead we make // a copy. The filed-based buffer will be released when it goes // out of scope. return MemoryBuffer::getMemBufferCopy(IRObjectBuffer.get()->getBuffer()); } private: std::string CacheDir; bool getCacheFilename(const std::string &ModID, std::string &CacheName) { std::string Prefix("file:"); size_t PrefixLength = Prefix.length(); if (ModID.substr(0, PrefixLength) != Prefix) return false; std::string CacheSubdir = ModID.substr(PrefixLength); #if defined(_WIN32) // Transform "X:\foo" => "/X\foo" for convenience. if (isalpha(CacheSubdir[0]) && CacheSubdir[1] == ':') { CacheSubdir[1] = CacheSubdir[0]; CacheSubdir[0] = '/'; } #endif CacheName = CacheDir + CacheSubdir; size_t pos = CacheName.rfind('.'); CacheName.replace(pos, CacheName.length() - pos, ".o"); return true; } }; static ExecutionEngine *EE = nullptr; static LLIObjectCache *CacheManager = nullptr; static void do_shutdown() { // Cygwin-1.5 invokes DLL's dtors before atexit handler. #ifndef DO_NOTHING_ATEXIT delete EE; if (CacheManager) delete CacheManager; llvm_shutdown(); #endif } // On Mingw and Cygwin, an external symbol named '__main' is called from the // generated 'main' function to allow static intialization. To avoid linking // problems with remote targets (because lli's remote target support does not // currently handle external linking) we add a secondary module which defines // an empty '__main' function. static void addCygMingExtraModule(ExecutionEngine *EE, LLVMContext &Context, StringRef TargetTripleStr) { IRBuilder<> Builder(Context); Triple TargetTriple(TargetTripleStr); // Create a new module. std::unique_ptr M = make_unique("CygMingHelper", Context); M->setTargetTriple(TargetTripleStr); // Create an empty function named "__main". Function *Result; if (TargetTriple.isArch64Bit()) { Result = Function::Create( TypeBuilder::get(Context), GlobalValue::ExternalLinkage, "__main", M.get()); } else { Result = Function::Create( TypeBuilder::get(Context), GlobalValue::ExternalLinkage, "__main", M.get()); } BasicBlock *BB = BasicBlock::Create(Context, "__main", Result); Builder.SetInsertPoint(BB); Value *ReturnVal; if (TargetTriple.isArch64Bit()) ReturnVal = ConstantInt::get(Context, APInt(64, 0)); else ReturnVal = ConstantInt::get(Context, APInt(32, 0)); Builder.CreateRet(ReturnVal); // Add this new module to the ExecutionEngine. EE->addModule(std::move(M)); } CodeGenOpt::Level getOptLevel() { switch (OptLevel) { default: errs() << "lli: Invalid optimization level.\n"; exit(1); case '0': return CodeGenOpt::None; case '1': return CodeGenOpt::Less; case ' ': case '2': return CodeGenOpt::Default; case '3': return CodeGenOpt::Aggressive; } llvm_unreachable("Unrecognized opt level."); } //===----------------------------------------------------------------------===// // main Driver function // int main(int argc, char **argv, char * const *envp) { sys::PrintStackTraceOnErrorSignal(); PrettyStackTraceProgram X(argc, argv); LLVMContext &Context = getGlobalContext(); atexit(do_shutdown); // Call llvm_shutdown() on exit. // If we have a native target, initialize it to ensure it is linked in and // usable by the JIT. InitializeNativeTarget(); InitializeNativeTargetAsmPrinter(); InitializeNativeTargetAsmParser(); cl::ParseCommandLineOptions(argc, argv, "llvm interpreter & dynamic compiler\n"); // If the user doesn't want core files, disable them. if (DisableCoreFiles) sys::Process::PreventCoreFiles(); // Load the bitcode... SMDiagnostic Err; std::unique_ptr Owner = parseIRFile(InputFile, Err, Context); Module *Mod = Owner.get(); if (!Mod) { Err.print(argv[0], errs()); return 1; } if (UseJITKind == JITKind::OrcLazy) return runOrcLazyJIT(std::move(Owner), argc, argv); if (EnableCacheManager) { std::string CacheName("file:"); CacheName.append(InputFile); Mod->setModuleIdentifier(CacheName); } // If not jitting lazily, load the whole bitcode file eagerly too. if (NoLazyCompilation) { if (std::error_code EC = Mod->materializeAll()) { errs() << argv[0] << ": bitcode didn't read correctly.\n"; errs() << "Reason: " << EC.message() << "\n"; exit(1); } } std::string ErrorMsg; EngineBuilder builder(std::move(Owner)); builder.setMArch(MArch); builder.setMCPU(MCPU); builder.setMAttrs(MAttrs); builder.setRelocationModel(RelocModel); builder.setCodeModel(CMModel); builder.setErrorStr(&ErrorMsg); builder.setEngineKind(ForceInterpreter ? EngineKind::Interpreter : EngineKind::JIT); builder.setUseOrcMCJITReplacement(UseJITKind == JITKind::OrcMCJITReplacement); // If we are supposed to override the target triple, do so now. if (!TargetTriple.empty()) Mod->setTargetTriple(Triple::normalize(TargetTriple)); // Enable MCJIT if desired. RTDyldMemoryManager *RTDyldMM = nullptr; if (!ForceInterpreter) { if (RemoteMCJIT) RTDyldMM = new ForwardingMemoryManager(); else RTDyldMM = new SectionMemoryManager(); // Deliberately construct a temp std::unique_ptr to pass in. Do not null out // RTDyldMM: We still use it below, even though we don't own it. builder.setMCJITMemoryManager( std::unique_ptr(RTDyldMM)); } else if (RemoteMCJIT) { errs() << "error: Remote process execution does not work with the " "interpreter.\n"; exit(1); } builder.setOptLevel(getOptLevel()); TargetOptions Options; if (FloatABIForCalls != FloatABI::Default) Options.FloatABIType = FloatABIForCalls; builder.setTargetOptions(Options); EE = builder.create(); if (!EE) { if (!ErrorMsg.empty()) errs() << argv[0] << ": error creating EE: " << ErrorMsg << "\n"; else errs() << argv[0] << ": unknown error creating EE!\n"; exit(1); } if (EnableCacheManager) { CacheManager = new LLIObjectCache(ObjectCacheDir); EE->setObjectCache(CacheManager); } // Load any additional modules specified on the command line. for (unsigned i = 0, e = ExtraModules.size(); i != e; ++i) { std::unique_ptr XMod = parseIRFile(ExtraModules[i], Err, Context); if (!XMod) { Err.print(argv[0], errs()); return 1; } if (EnableCacheManager) { std::string CacheName("file:"); CacheName.append(ExtraModules[i]); XMod->setModuleIdentifier(CacheName); } EE->addModule(std::move(XMod)); } for (unsigned i = 0, e = ExtraObjects.size(); i != e; ++i) { ErrorOr> Obj = object::ObjectFile::createObjectFile(ExtraObjects[i]); if (!Obj) { Err.print(argv[0], errs()); return 1; } object::OwningBinary &O = Obj.get(); EE->addObjectFile(std::move(O)); } for (unsigned i = 0, e = ExtraArchives.size(); i != e; ++i) { ErrorOr> ArBufOrErr = MemoryBuffer::getFileOrSTDIN(ExtraArchives[i]); if (!ArBufOrErr) { Err.print(argv[0], errs()); return 1; } std::unique_ptr &ArBuf = ArBufOrErr.get(); ErrorOr> ArOrErr = object::Archive::create(ArBuf->getMemBufferRef()); if (std::error_code EC = ArOrErr.getError()) { errs() << EC.message(); return 1; } std::unique_ptr &Ar = ArOrErr.get(); object::OwningBinary OB(std::move(Ar), std::move(ArBuf)); EE->addArchive(std::move(OB)); } // If the target is Cygwin/MingW and we are generating remote code, we // need an extra module to help out with linking. if (RemoteMCJIT && Triple(Mod->getTargetTriple()).isOSCygMing()) { addCygMingExtraModule(EE, Context, Mod->getTargetTriple()); } // The following functions have no effect if their respective profiling // support wasn't enabled in the build configuration. EE->RegisterJITEventListener( JITEventListener::createOProfileJITEventListener()); EE->RegisterJITEventListener( JITEventListener::createIntelJITEventListener()); if (!NoLazyCompilation && RemoteMCJIT) { errs() << "warning: remote mcjit does not support lazy compilation\n"; NoLazyCompilation = true; } EE->DisableLazyCompilation(NoLazyCompilation); // If the user specifically requested an argv[0] to pass into the program, // do it now. if (!FakeArgv0.empty()) { InputFile = static_cast(FakeArgv0); } else { // Otherwise, if there is a .bc suffix on the executable strip it off, it // might confuse the program. if (StringRef(InputFile).endswith(".bc")) InputFile.erase(InputFile.length() - 3); } // Add the module's name to the start of the vector of arguments to main(). InputArgv.insert(InputArgv.begin(), InputFile); // Call the main function from M as if its signature were: // int main (int argc, char **argv, const char **envp) // using the contents of Args to determine argc & argv, and the contents of // EnvVars to determine envp. // Function *EntryFn = Mod->getFunction(EntryFunc); if (!EntryFn) { errs() << '\'' << EntryFunc << "\' function not found in module.\n"; return -1; } // Reset errno to zero on entry to main. errno = 0; int Result; // Sanity check use of remote-jit: LLI currently only supports use of the // remote JIT on Unix platforms. if (RemoteMCJIT) { #ifndef LLVM_ON_UNIX errs() << "Warning: host does not support external remote targets.\n" << " Defaulting to local execution execution\n"; return -1; #else if (ChildExecPath.empty()) { errs() << "-remote-mcjit requires -mcjit-remote-process.\n"; exit(1); } else if (!sys::fs::can_execute(ChildExecPath)) { errs() << "Unable to find usable child executable: '" << ChildExecPath << "'\n"; return -1; } #endif } if (!RemoteMCJIT) { // If the program doesn't explicitly call exit, we will need the Exit // function later on to make an explicit call, so get the function now. Constant *Exit = Mod->getOrInsertFunction("exit", Type::getVoidTy(Context), Type::getInt32Ty(Context), nullptr); // Run static constructors. if (!ForceInterpreter) { // Give MCJIT a chance to apply relocations and set page permissions. EE->finalizeObject(); } EE->runStaticConstructorsDestructors(false); // Trigger compilation separately so code regions that need to be // invalidated will be known. (void)EE->getPointerToFunction(EntryFn); // Clear instruction cache before code will be executed. if (RTDyldMM) static_cast(RTDyldMM)->invalidateInstructionCache(); // Run main. Result = EE->runFunctionAsMain(EntryFn, InputArgv, envp); // Run static destructors. EE->runStaticConstructorsDestructors(true); // If the program didn't call exit explicitly, we should call it now. // This ensures that any atexit handlers get called correctly. if (Function *ExitF = dyn_cast(Exit)) { std::vector Args; GenericValue ResultGV; ResultGV.IntVal = APInt(32, Result); Args.push_back(ResultGV); EE->runFunction(ExitF, Args); errs() << "ERROR: exit(" << Result << ") returned!\n"; abort(); } else { errs() << "ERROR: exit defined with wrong prototype!\n"; abort(); } } else { // else == "if (RemoteMCJIT)" // Remote target MCJIT doesn't (yet) support static constructors. No reason // it couldn't. This is a limitation of the LLI implemantation, not the // MCJIT itself. FIXME. // Lanch the remote process and get a channel to it. std::unique_ptr C = launchRemote(); if (!C) { errs() << "Failed to launch remote JIT.\n"; exit(1); } // Create a remote target client running over the channel. typedef orc::remote::OrcRemoteTargetClient MyRemote; ErrorOr R = MyRemote::Create(*C); if (!R) { errs() << "Could not create remote: " << R.getError().message() << "\n"; exit(1); } // Create a remote memory manager. std::unique_ptr RemoteMM; if (auto EC = R->createRemoteMemoryManager(RemoteMM)) { errs() << "Could not create remote memory manager: " << EC.message() << "\n"; exit(1); } // Forward MCJIT's memory manager calls to the remote memory manager. static_cast(RTDyldMM)->setMemMgr( std::move(RemoteMM)); // Forward MCJIT's symbol resolution calls to the remote. static_cast(RTDyldMM)->setResolver( orc::createLambdaResolver( [&](const std::string &Name) { orc::TargetAddress Addr = 0; if (auto EC = R->getSymbolAddress(Addr, Name)) { errs() << "Failure during symbol lookup: " << EC.message() << "\n"; exit(1); } return RuntimeDyld::SymbolInfo(Addr, JITSymbolFlags::Exported); }, [](const std::string &Name) { return nullptr; } )); // Grab the target address of the JIT'd main function on the remote and call // it. // FIXME: argv and envp handling. orc::TargetAddress Entry = EE->getFunctionAddress(EntryFn->getName().str()); EE->finalizeObject(); DEBUG(dbgs() << "Executing '" << EntryFn->getName() << "' at 0x" << format("%llx", Entry) << "\n"); if (auto EC = R->callIntVoid(Result, Entry)) errs() << "ERROR: " << EC.message() << "\n"; // Like static constructors, the remote target MCJIT support doesn't handle // this yet. It could. FIXME. // Delete the EE - we need to tear it down *before* we terminate the session // with the remote, otherwise it'll crash when it tries to release resources // on a remote that has already been disconnected. delete EE; EE = nullptr; // Signal the remote target that we're done JITing. R->terminateSession(); } return Result; } std::unique_ptr launchRemote() { #ifndef LLVM_ON_UNIX llvm_unreachable("launchRemote not supported on non-Unix platforms"); #else int PipeFD[2][2]; pid_t ChildPID; // Create two pipes. if (pipe(PipeFD[0]) != 0 || pipe(PipeFD[1]) != 0) perror("Error creating pipe: "); ChildPID = fork(); if (ChildPID == 0) { // In the child... // Close the parent ends of the pipes close(PipeFD[0][1]); close(PipeFD[1][0]); // Execute the child process. std::unique_ptr ChildPath, ChildIn, ChildOut; { ChildPath.reset(new char[ChildExecPath.size() + 1]); std::copy(ChildExecPath.begin(), ChildExecPath.end(), &ChildPath[0]); ChildPath[ChildExecPath.size()] = '\0'; - std::string ChildInStr = std::to_string(PipeFD[0][0]); + std::string ChildInStr = utostr(PipeFD[0][0]); ChildIn.reset(new char[ChildInStr.size() + 1]); std::copy(ChildInStr.begin(), ChildInStr.end(), &ChildIn[0]); ChildIn[ChildInStr.size()] = '\0'; - std::string ChildOutStr = std::to_string(PipeFD[1][1]); + std::string ChildOutStr = utostr(PipeFD[1][1]); ChildOut.reset(new char[ChildOutStr.size() + 1]); std::copy(ChildOutStr.begin(), ChildOutStr.end(), &ChildOut[0]); ChildOut[ChildOutStr.size()] = '\0'; } char * const args[] = { &ChildPath[0], &ChildIn[0], &ChildOut[0], nullptr }; int rc = execv(ChildExecPath.c_str(), args); if (rc != 0) perror("Error executing child process: "); llvm_unreachable("Error executing child process"); } // else we're the parent... // Close the child ends of the pipes close(PipeFD[0][0]); close(PipeFD[1][1]); // Return an RPC channel connected to our end of the pipes. return llvm::make_unique(PipeFD[1][0], PipeFD[0][1]); #endif } Index: projects/clang380-import/contrib/llvm =================================================================== --- projects/clang380-import/contrib/llvm (revision 294608) +++ projects/clang380-import/contrib/llvm (revision 294609) Property changes on: projects/clang380-import/contrib/llvm ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor/llvm/dist:r294170-294607 Index: projects/clang380-import/lib/clang/include/clang/Basic/Version.inc =================================================================== --- projects/clang380-import/lib/clang/include/clang/Basic/Version.inc (revision 294608) +++ projects/clang380-import/lib/clang/include/clang/Basic/Version.inc (revision 294609) @@ -1,10 +1,10 @@ /* $FreeBSD$ */ #define CLANG_VERSION 3.8.0 #define CLANG_VERSION_MAJOR 3 #define CLANG_VERSION_MINOR 8 #define CLANG_VERSION_PATCHLEVEL 0 #define CLANG_VENDOR "FreeBSD " -#define SVN_REVISION "257836" +#define SVN_REVISION "258549"